In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix

In [135]:
# Load the raw page-view logs for the labelled (train) and unlabelled (test)
# users from CSV files located relative to the notebook's working directory.
TRAIN_CSV = "./data/train.csv"
TEST_CSV = "./data/test.csv"

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [136]:
# Build the per-user input matrix (X / X_test) and the target vector (y).

# Size of the raw page-view logs before any aggregation.
print("Rohdaten")
print("Train Zeilen (Pageviews):", train_data.shape[0])
print("Test Zeilen (Pageviews):", test_data.shape[0])
print("Unterschiedliche Paths im Train (roh):",
      train_data["path"].nunique())
print("Unterschiedliche Paths im Test (roh):",
      test_data["path"].nunique())

# Collapse all page views of a user into one space-separated "document",
# giving one row per user_id (groupby sorts the user_ids).
train_docs = train_data.groupby("user_id")["path"].agg(" ".join)
test_docs = test_data.groupby("user_id")["path"].agg(" ".join)

print("-------------------------")
print("nach Aggregation")
print("Train Docs (1 Zeile pro User):", train_docs.shape[0])
print("Test Docs (1 Zeile pro User):", test_docs.shape[0])

# Bag-of-words over the user documents; a token must occur in at least
# 5 user documents to become a feature.
# NOTE(review): with the default analyzer CountVectorizer lower-cases the text
# and splits it into word tokens of two or more alphanumeric characters, so a
# path is broken into its components instead of being kept as a single feature.
# If whole paths are intended as features, pass e.g. token_pattern=r"\S+" and
# lowercase=False -- this changes the feature space, so re-validate the model.
vec = CountVectorizer(min_df=5)

# Input variable: the vocabulary is learned on the training users only and
# re-used unchanged for the test users.
X = vec.fit_transform(train_docs)
X_test = vec.transform(test_docs)
print("Token-Features (Pfad-Bestandteile) nach min_df=5:",
      len(vec.vocabulary_))

# Target variable: one label per user, put explicitly into the row order of
# train_docs (and therefore of X) instead of relying on two independent
# groupby calls happening to produce the same order.
y = train_data.groupby("user_id")["gender"].first().reindex(train_docs.index)
assert y.notna().all(), "Fehlendes Gender-Label für mindestens einen Trainings-User"





Rohdaten
Train Zeilen (Pageviews): 2403279
Test Zeilen (Pageviews): 275840
Unterschiedliche Paths im Train (roh): 26394
Unterschiedliche Paths im Test (roh): 26392
-------------------------
nach Aggregation
Train Docs (1 Zeile pro User): 13513
Test Docs (1 Zeile pro User): 1487
Paths als Features nach min_df=5: 86101


In [137]:
# Feature engineering: append three per-user behaviour counts to the
# bag-of-words matrices X and X_test.

# A pause longer than this between two consecutive page views of the same
# user is counted as the user coming back to the site.
SESSION_GAP = pd.Timedelta(minutes=30)


def build_behaviour_features(pageviews, user_index, session_gap=SESSION_GAP):
    """Compute per-user behaviour counts from a page-view log.

    Parameters
    ----------
    pageviews : pandas.DataFrame
        One row per page view with the columns "user_id", "path" and
        "timestamp" (already converted to datetime).
    user_index : pandas.Index
        user_ids in the row order required for the result; users absent from
        ``pageviews`` get 0 in every column.
    session_gap : pandas.Timedelta
        Minimum pause between two consecutive page views of one user that is
        counted as a return.

    Returns
    -------
    numpy.ndarray of shape (len(user_index), 3)
        Columns: total number of page views, number of distinct paths,
        number of pauses longer than ``session_gap``.
    """
    by_user = pageviews.groupby("user_id")

    # Many page views may hint at purchase intent, few at a stray visitor.
    total_visits = by_user.size()

    # Many distinct paths -> browsing, few distinct paths -> targeted visit.
    unique_paths = by_user["path"].nunique()

    # Time between consecutive page views of the same user. The first view of
    # each user (and any unparseable timestamp) yields NaT, which compares as
    # False below and is therefore never counted as a return.
    ordered = pageviews.sort_values(["user_id", "timestamp"])
    pause = ordered.groupby("user_id")["timestamp"].diff()
    num_returns = (pause > session_gap).groupby(ordered["user_id"]).sum()

    # Bring every count into the requested row order, one column per feature.
    return np.column_stack([
        counts.reindex(user_index, fill_value=0).to_numpy()
        for counts in (total_visits, unique_paths, num_returns)
    ])


# Parse the timestamps; unparseable values become NaT. Converting a column
# that is already datetime leaves it unchanged, so this is safe to re-run.
train_data["timestamp"] = pd.to_datetime(train_data["timestamp"], errors="coerce")
test_data["timestamp"] = pd.to_datetime(test_data["timestamp"], errors="coerce")

# Extra feature matrices, rows ordered like train_docs / test_docs.
X_extra = build_behaviour_features(train_data, train_docs.index)
X_test_extra = build_behaviour_features(test_data, test_docs.index)

# Append the extra columns. Slicing back to the vocabulary width first keeps
# this cell idempotent: re-running it replaces the three extra columns
# instead of stacking three more onto X / X_test each time.
n_text_features = len(vec.vocabulary_)
X = hstack([X[:, :n_text_features], csr_matrix(X_extra)]).tocsr()
X_test = hstack([X_test[:, :n_text_features], csr_matrix(X_test_extra)]).tocsr()


# ---------- Check ----------

print("X shape:", X.shape)
print("X_test shape:", X_test.shape)


X shape: (13513, 86104)
X_test shape: (1487, 86104)


In [138]:
# Validation set-up.
# Hold out 20 % of the users as an internal validation set; the split is
# stratified on the label and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Classifier under evaluation, with the solver's iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

# 5-fold cross-validated accuracy, computed on the training split only.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
mean_cv_accuracy = cv_scores.mean()

print("durchschn. CV Genauigkeit (Training):", mean_cv_accuracy)


durchschn. CV Genauigkeit (Training): 0.9957446808510639


In [139]:
# Fit on the training split and measure accuracy on the held-out
# validation users.
model.fit(X_train, y_train)
val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)
print("Interne Validations Genauigkeit (Testing):", val_accuracy)

Interne Validations Genauigkeit (Testing): 0.9959304476507584


In [140]:
# Final training: refit the classifier on all labelled users.
model.fit(X, y)

# Sanity check: the feature count the fitted model expects should equal the
# column count of both X and X_test.
expected_features = model.n_features_in_
print("model expects:", expected_features)
print(X.shape, X_test.shape, expected_features)

model expects: 86104
(13513, 86104) (1487, 86104) 86104


In [141]:
# Predict the label for every user in test.csv (which carries no labels).
# The rows of X_test, and therefore of predictions, follow test_docs.index.
predictions = model.predict(X_test)
print("Predictions für test.csv erstellt")

Predictions für test.csv erstellt


In [142]:
# Save the result: one row per test user with the predicted label, written
# without the DataFrame's positional index.
output = pd.DataFrame({"user_id": test_docs.index, "gender": predictions})
output.to_csv("./data/predictions.csv", index=False)

print("Vorhersage gespeichert als predictions.csv")

Vorhersage gespeichert als predictions.csv
