<a href="https://colab.research.google.com/github/Froztgal/Stepic_ML_Contest/blob/main/stepic_ml_contest_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Описание данных

---


[events_train.csv](https://stepik.org/media/attachments/course/4852/event_data_train.zip) - данные о действиях, которые совершают студенты со стэпами


---


* step_id - id стэпа
* user_id - анонимизированный id юзера
* timestamp - время наступления события в формате unix date
* action - событие, возможные значения: 
  * discovered - пользователь перешел на стэп
  * viewed - просмотр шага
  * started_attempt - начало попытки решить шаг; раньше, перед тем как приступить к решению практического шага, нужно было явно нажать кнопку «начать решение»
  * passed - удачное решение практического шага


---


[submissions_train.csv](https://stepik.org/media/attachments/course/4852/submissions_data_train.zip) - данные о времени и статусах сабмитов к практическим заданиям


---


* step_id - id стэпа
* timestamp - время отправки решения в формате unix date
* submission_status - статус решения
* user_id - анонимизированный id юзера

---

# Imports and settings

---

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from graphviz import Source
from IPython.display import SVG
from IPython.display import HTML
from IPython.display import display

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn defaults for every plot below: 20x10 figures and a 1.5x font scale.
sns.set(rc={"figure.figsize": (20, 10)})
sns.set(font_scale=1.5)

# CSS rule limiting rendered SVG output to 70% width and height.
style = "<style>svg{width:70% !important; height:70% !important;}</style>"
HTML(style)

---

# Functions and methods

---

In [None]:
def add_date(dataframe):
  """Return a copy of ``dataframe`` extended with datetime columns.

  Args:
    dataframe: DataFrame with a ``timestamp`` column that is interpreted as
      Unix time in seconds.

  Returns:
    A new DataFrame with two extra columns: ``date_clock`` (``timestamp``
    converted with ``pd.to_datetime(..., unit='s')``) and ``date`` (the
    calendar date of ``date_clock``).
  """
  date_clock = pd.to_datetime(dataframe['timestamp'], unit='s')

  # ``assign`` builds a new frame, so the caller's DataFrame is left untouched
  # (plain assignment through an alias would add the columns to the argument).
  return dataframe.assign(date_clock=date_clock, date=date_clock.dt.date)

In [None]:
def get_all_users(events):
  """Collect the distinct user ids present in ``events``.

  Args:
    events: DataFrame with a ``user_id`` column.

  Returns:
    DataFrame with a single ``user_id`` column holding each id once,
    sorted in ascending order.
  """
  distinct_ids = events["user_id"].unique()

  return pd.DataFrame({"user_id": np.sort(distinct_ids)})

In [None]:
def get_pivot_table(dataframe, columns, index="user_id", values="step_id", aggfunc="count", fill_value=0):
  """Pivot ``dataframe`` into one row per ``index`` value.

  Args:
    dataframe: source DataFrame.
    columns: column whose distinct values become the output columns.
    index: column that identifies a row of the result.
    values: column that is aggregated.
    aggfunc: aggregation applied to ``values`` within each cell.
    fill_value: value used for cells that have no data.

  Returns:
    The pivoted DataFrame with ``index`` turned back into a regular column.
  """
  pivot_options = dict(index=index,
                       columns=columns,
                       values=values,
                       aggfunc=aggfunc,
                       fill_value=fill_value)
  wide = pd.pivot_table(dataframe, **pivot_options)

  return wide.reset_index()

In [None]:
def get_scores(submissions, users_id):
  """Count submissions per status for every user.

  Args:
    submissions: submission log; pivoted on ``submission_status`` with
      ``get_pivot_table`` (counting ``step_id`` per ``user_id``).
    users_id: DataFrame with a ``user_id`` column listing the users that
      must appear in the result.

  Returns:
    Outer join of ``users_id`` with the per-status counts, with every
    missing value replaced by 0.
  """
  per_status_counts = get_pivot_table(submissions, "submission_status")
  joined = pd.merge(users_id, per_status_counts, on='user_id', how='outer')

  return joined.fillna(0)

In [None]:
def add_pass_mark(train_dataframe, treshold):
  """Return a copy of ``train_dataframe`` with a boolean ``passed_course`` column.

  Args:
    train_dataframe: DataFrame with a ``correct`` column.
    treshold: value that ``correct`` must strictly exceed for the row to be
      marked as passed.

  Returns:
    A new DataFrame with ``passed_course`` set to ``correct > treshold``.
  """
  # ``assign`` returns a new frame instead of writing the column into the
  # caller's DataFrame.
  return train_dataframe.assign(passed_course=train_dataframe["correct"] > treshold)

In [None]:
def get_filtering_timestamp(events, treshold):
  """Build the per-user cut-off key used to filter logs by time.

  Args:
    events: DataFrame with ``user_id`` and ``timestamp`` columns.
    treshold: offset added to each user's earliest ``timestamp``.

  Returns:
    DataFrame with one row per user and two columns: ``user_id`` and
    ``user_learning_time_treshold``, a string of the form
    ``"<user_id>_<earliest timestamp + treshold>"``.
  """
  first_seen = events.groupby("user_id", as_index=False)["timestamp"].min()

  user_part = first_seen["user_id"].map(str)
  cutoff_part = (first_seen["timestamp"] + treshold).map(str)

  return pd.DataFrame({
    "user_id": first_seen["user_id"],
    "user_learning_time_treshold": user_part + "_" + cutoff_part,
  })

In [None]:
def get_time_features(events):
  """Derive calendar features from each user's earliest event.

  Args:
    events: DataFrame with ``user_id`` and ``timestamp`` columns;
      ``timestamp`` is interpreted as Unix time in seconds.

  Returns:
    DataFrame with one row per user: ``user_id`` followed by ``start_year``,
    ``start_quarter``, ``start_month``, ``start_week`` (ISO week),
    ``start_day``, ``start_day_of_week`` and ``start_hour``.
  """
  first_seen = events.groupby("user_id", as_index=False)["timestamp"].min()
  started = pd.to_datetime(first_seen["timestamp"], unit='s').dt

  return pd.DataFrame({
    "user_id": first_seen["user_id"],
    "start_year": started.year,
    "start_quarter": started.quarter,
    "start_month": started.month,
    "start_week": started.isocalendar().week,
    "start_day": started.day,
    "start_day_of_week": started.weekday,
    "start_hour": started.hour,
  })

In [None]:
def filter_by_time(dataframe, users_start_time):
  """Keep only the rows that fall inside each user's time window.

  Args:
    dataframe: log with ``user_id`` and ``timestamp`` columns. It is not
      modified.
    users_start_time: DataFrame with ``user_id`` and
      ``user_learning_time_treshold`` columns, the latter being a
      ``"<user_id>_<timestamp>"`` string.

  Returns:
    A new DataFrame: ``dataframe`` joined with ``users_start_time`` on
    ``user_id`` plus a ``user_time`` key column, restricted to rows where
    ``user_time <= user_learning_time_treshold``.
  """
  # Build the comparison key on a new frame so the caller's DataFrame does not
  # silently gain a ``user_time`` column.
  keyed = dataframe.assign(
    user_time=dataframe["user_id"].map(str) + "_" + dataframe["timestamp"].map(str))

  merged = keyed.merge(users_start_time, on="user_id", how="outer")

  # NOTE: both sides are strings, so this comparison is lexicographic. It
  # agrees with numeric order only while all timestamps have the same number
  # of digits.
  # NOTE(review): rows left unmatched by the outer join carry NaN on one side;
  # presumably they compare as False and are dropped here - verify.
  within_window = merged["user_time"] <= merged["user_learning_time_treshold"]

  return merged[within_window]

In [None]:
def get_steps_tried(train_submissions):
  """Count how many distinct steps each user submitted to.

  Args:
    train_submissions: DataFrame with ``user_id`` and ``step_id`` columns.

  Returns:
    DataFrame with columns ``user_id`` and ``steps_tried`` (number of
    unique ``step_id`` values for that user).
  """
  distinct_steps = train_submissions.groupby("user_id")["step_id"].nunique()

  return distinct_steps.reset_index(name="steps_tried")

In [None]:
def get_unique_days(dataframe):
  """Count the distinct ``date`` values recorded for each user.

  Args:
    dataframe: DataFrame with ``user_id`` and ``date`` columns.

  Returns:
    DataFrame with columns ``user_id`` and ``date``, where ``date`` holds
    the number of unique dates for that user.
  """
  days_per_user = dataframe.groupby('user_id')['date'].nunique()

  return days_per_user.reset_index()

In [None]:
def get_x_y_train(events, submissions, treshold):
  """Assemble the training features and target from the raw logs.

  Args:
    events: event log passed through ``add_date`` and the feature helpers.
    submissions: submission log, processed the same way.
    treshold: offset added to each user's earliest event timestamp; only
      rows up to that cut-off contribute to the count features.

  Returns:
    Tuple ``(X, y, z)``:
      X: feature frame indexed by ``user_id``, missing values filled with 0.
      y: integer target taken from ``passed_course``, which is True when
        the user's ``correct`` count over the whole submission log
        exceeds 40.
      z: the filled frame before ``passed_course`` and ``user_id`` are
        removed.
  """
  on_user = dict(on="user_id", how="outer")

  # Add date/time columns derived from the timestamps.
  dated_events = add_date(events)
  dated_submissions = add_date(submissions)

  # Score every user and mark those who passed (more than 40 correct).
  every_user = get_all_users(dated_events)
  labelled = add_pass_mark(get_scores(dated_submissions, every_user), 40)

  # Restrict both logs to each user's window after the first event.
  cutoffs = get_filtering_timestamp(dated_events, treshold)
  early_events = filter_by_time(dated_events, cutoffs)
  early_submissions = filter_by_time(dated_submissions, cutoffs)

  # Distinct steps tried, then per-status and per-action counts.
  X = get_steps_tried(early_submissions)
  X = X.merge(get_pivot_table(early_submissions, "submission_status"), **on_user)
  X = X.merge(get_pivot_table(early_events, "action"), **on_user)
  X = X.merge(labelled[["user_id", "passed_course"]], **on_user)

  # Number of distinct active days in each log.
  X = X.merge(get_unique_days(early_events), **on_user).rename(columns={"date": "e_days"})
  X = X.merge(get_unique_days(early_submissions), **on_user).rename(columns={"date": "s_days"})

  # Calendar features of the first event (computed on the unfiltered log).
  X = X.merge(get_time_features(dated_events), **on_user)

  # The target is taken before missing values are filled.
  y = X["passed_course"].map(int)

  z = X.fillna(0)
  X = z.drop(columns=["passed_course"]).set_index("user_id")

  return X, y, z

In [None]:
def get_x_pred(events, submissions):
  """Assemble the feature frame for the prediction set.

  Builds the same feature columns as the training pipeline, but applies no
  time-window filtering and adds no target column.

  Args:
    events: event log of the users to predict for.
    submissions: submission log of the same users.

  Returns:
    Feature frame indexed by ``user_id`` and sorted by it, with missing
    values filled with 0.
  """
  on_user = dict(on="user_id", how="outer")

  # Add date/time columns derived from the timestamps.
  dated_events = add_date(events)
  dated_submissions = add_date(submissions)

  # Distinct steps tried, then per-status and per-action counts.
  X = get_steps_tried(dated_submissions)
  X = X.merge(get_pivot_table(dated_submissions, "submission_status"), **on_user)
  X = X.merge(get_pivot_table(dated_events, "action"), **on_user)

  # Number of distinct active days in each log.
  X = X.merge(get_unique_days(dated_events), **on_user).rename(columns={"date": "e_days"})
  X = X.merge(get_unique_days(dated_submissions), **on_user).rename(columns={"date": "s_days"})

  # Calendar features of the first event.
  X = X.merge(get_time_features(dated_events), **on_user)

  return X.fillna(0).set_index("user_id").sort_index()

In [None]:
def exploratory_data_analys(z, lim=10):
  """Plot a correlation heatmap and one figure per feature against the target.

  Args:
    z: DataFrame with ``user_id``, ``passed_course`` and feature columns.
    lim: features with more than ``lim`` distinct values are drawn as a
      line plot, the others as a bar plot.
  """
  correlations = z.drop(columns="user_id").corr()
  sns.heatmap(correlations, annot=True, fmt=".1f")

  not_features = ("passed_course", "user_id")
  for feature in z.columns:
    if feature in not_features:
      continue

    # One named figure per feature.
    plt.figure(feature)
    feature_values = z[feature].map(float)
    if z[feature].nunique() <= lim:
      sns.barplot(data=z, x=feature_values, y=z.passed_course.map(int))
    else:
      sns.lineplot(data=z, x=feature_values, y=z.passed_course)

In [None]:
def rfc(X_train, X_test, y_train, y_test, scaler, cv=5):
  """Tune a random forest with a randomized search and score it.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the search in the pipeline.
    cv: number of cross-validation folds used by the search.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``. The pipeline has two
    steps: ``scaler`` and the fitted ``RandomizedSearchCV`` (configured
    with ``scoring="roc_auc"``); both scores come from ``pipeline.score``.
  """
  params = {
    "n_estimators": range(10, 1000, 10),
    "criterion": ["gini", "entropy"],
    "max_depth": range(5, 100, 5),
    "min_samples_leaf": range(5, 100, 5),
    "min_samples_split": range(5, 100, 5),
    # "auto" is no longer accepted by current scikit-learn releases, where
    # every sampled candidate using it fails to fit. For classifiers it
    # meant the same as "sqrt", so no distinct option is lost.
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced", "balanced_subsample"],
    "bootstrap": [True, False]
  }

  forest = RandomForestClassifier(n_jobs=-1, random_state=42)
  clf = RandomizedSearchCV(forest, cv=cv, scoring="roc_auc", param_distributions=params, n_jobs=-1)
  pipeline = make_pipeline(scaler, clf)
  pipeline.fit(X_train, y_train)

  train_score = pipeline.score(X_train, y_train)
  test_score = pipeline.score(X_test, y_test)

  return pipeline, train_score, test_score

In [None]:
def lrc(X_train, X_test, y_train, y_test, scaler, cv=5):
  """Tune a logistic regression with a randomized search and score it.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the search in the pipeline.
    cv: number of cross-validation folds used by the search.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``. The pipeline has two
    steps: ``scaler`` and the fitted ``RandomizedSearchCV`` (configured
    with ``scoring="roc_auc"``); both scores come from ``pipeline.score``.
  """
  shared = {
    "fit_intercept": [True, False],
    "max_iter": range(100, 1000, 10),
    "class_weight": ["balanced", None]
  }
  regularised = {**shared, "C": np.linspace(0.1, 10, 100)}

  # Not every solver supports every penalty. Sampling them independently
  # produces candidates that fail to fit and are scored as nan, so the grid
  # is split into groups of compatible combinations.
  params = [
    {**regularised, "penalty": ["l2"],
     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
    {**regularised, "penalty": ["l1"], "solver": ["liblinear", "saga"]},
    # elasticnet is only available with saga and needs an explicit l1_ratio.
    {**regularised, "penalty": ["elasticnet"], "solver": ["saga"],
     "l1_ratio": np.linspace(0.1, 0.9, 9)},
    # No regularisation, so C is not searched for this group.
    # scikit-learn releases before 1.2 spell this value as the string "none".
    {**shared, "penalty": [None], "solver": ["newton-cg", "lbfgs", "sag", "saga"]},
  ]

  estimator = LogisticRegression(n_jobs=-1, random_state=42)
  clf = RandomizedSearchCV(estimator, cv=cv, scoring="roc_auc", param_distributions=params, n_jobs=-1)
  pipeline = make_pipeline(scaler, clf)
  pipeline.fit(X_train, y_train)

  train_score = pipeline.score(X_train, y_train)
  test_score = pipeline.score(X_test, y_test)

  return pipeline, train_score, test_score

In [None]:
def svc(X_train, X_test, y_train, y_test, scaler, cv=5):
  """Tune a support vector classifier with a randomized search and score it.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the search in the pipeline.
    cv: number of cross-validation folds used by the search.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``. The pipeline has two
    steps: ``scaler`` and the fitted ``RandomizedSearchCV`` (configured
    with ``scoring="roc_auc"``); both scores come from ``pipeline.score``.
  """
  # "probability" is deliberately not searched: a sampled False would override
  # the probability=True set on the estimator below and leave the selected
  # model without predict_proba.
  params = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": np.linspace(0.1, 10, 100),
    "shrinking": [True, False],
    "gamma": ["scale", "auto"],
    "max_iter": range(100, 1000, 10),
    "class_weight": ["balanced", None]
  }

  estimator = SVC(random_state=42, probability=True)
  clf = RandomizedSearchCV(estimator, cv=cv, scoring="roc_auc", param_distributions=params, n_jobs=-1)
  pipeline = make_pipeline(scaler, clf)
  pipeline.fit(X_train, y_train)

  train_score = pipeline.score(X_train, y_train)
  test_score = pipeline.score(X_test, y_test)

  return pipeline, train_score, test_score

In [None]:
def dtc(X_train, X_test, y_train, y_test, scaler, cv=5):
  """Tune a decision tree with a randomized search and score it.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the search in the pipeline.
    cv: number of cross-validation folds used by the search.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``. The pipeline has two
    steps: ``scaler`` and the fitted ``RandomizedSearchCV`` (configured
    with ``scoring="roc_auc"``); both scores come from ``pipeline.score``.
  """
  params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 100, 1),
    "min_samples_split": range(2, 100, 1),
    "min_samples_leaf": range(2, 100, 1),
    # "auto" is no longer accepted by current scikit-learn releases, where
    # every sampled candidate using it fails to fit. For classifiers it
    # meant the same as "sqrt", so no distinct option is lost.
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced", None]
  }

  estimator = DecisionTreeClassifier(random_state=42)
  clf = RandomizedSearchCV(estimator, cv=cv, scoring="roc_auc", param_distributions=params, n_jobs=-1)
  pipeline = make_pipeline(scaler, clf)
  pipeline.fit(X_train, y_train)

  train_score = pipeline.score(X_train, y_train)
  test_score = pipeline.score(X_test, y_test)

  return pipeline, train_score, test_score

In [None]:
def nbc(X_train, X_test, y_train, y_test, scaler):
  """Fit a Gaussian naive Bayes pipeline and score it with ROC AUC.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the classifier.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``; the scores are ROC AUC
    values computed from the second ``predict_proba`` column.
  """
  pipeline = make_pipeline(scaler, GaussianNB())
  pipeline.fit(X_train, y_train)

  def _auc(features, target):
    # ROC AUC from the second predict_proba column.
    return roc_auc_score(target, pipeline.predict_proba(features)[:, 1])

  return pipeline, _auc(X_train, y_train), _auc(X_test, y_test)

In [None]:
def gbc(X_train, X_test, y_train, y_test, scaler):
  """Fit a gradient boosting pipeline with default settings and score it.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the classifier.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``; the scores are ROC AUC
    values computed from the second ``predict_proba`` column.
  """
  booster = GradientBoostingClassifier()
  pipeline = make_pipeline(scaler, booster)
  pipeline.fit(X_train, y_train)

  train_proba = pipeline.predict_proba(X_train)
  test_proba = pipeline.predict_proba(X_test)

  return (pipeline,
          roc_auc_score(y_train, train_proba[:, 1]),
          roc_auc_score(y_test, test_proba[:, 1]))

In [None]:
def nnc(X_train, X_test, y_train, y_test, scaler):
  """Fit a small multi-layer perceptron pipeline and score it with ROC AUC.

  The network uses hidden layers of 16 and 6 units, ``tanh`` activation, an
  adaptive learning rate and at most 1000 iterations.

  Args:
    X_train, y_train: data the pipeline is fitted on.
    X_test, y_test: held-out data used for the test score.
    scaler: transformer placed in front of the classifier.

  Returns:
    Tuple ``(pipeline, train_score, test_score)``; the scores are ROC AUC
    values computed from the second ``predict_proba`` column.
  """
  network = MLPClassifier(hidden_layer_sizes=(16, 6), learning_rate="adaptive", activation="tanh", max_iter=1000)
  pipeline = make_pipeline(scaler, network)
  pipeline.fit(X_train, y_train)

  splits = ((X_train, y_train), (X_test, y_test))
  train_score, test_score = [
    roc_auc_score(target, pipeline.predict_proba(features)[:, 1])
    for features, target in splits
  ]

  return pipeline, train_score, test_score

По условию мы должны сделать предсказание, используя только данные за первые два дня.

In [None]:
# Length of the per-user observation window passed to get_x_y_train.
learning_time_treshold = 2 * 24 * 60 * 60 # 2 days in seconds

In [None]:
# Training event log, read straight from the zipped CSV attachment.
train_events_data = pd.read_csv("https://stepik.org/media/attachments/course/4852/event_data_train.zip")

In [None]:
# Training submission log, read straight from the zipped CSV attachment.
train_submission_data = pd.read_csv("https://stepik.org/media/attachments/course/4852/submissions_data_train.zip")

In [None]:
# X: features indexed by user_id, y: integer target, z: filled frame that still holds user_id and passed_course.
X, y, z = get_x_y_train(train_events_data, train_submission_data, learning_time_treshold)

In [None]:
# X.sort_index().head()

In [None]:
# Hold out 30% of the users for testing; stratify on y and fix the seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# exploratory_data_analys(z)

---

# Подготовка данных для предсказания

---

In [None]:
# Event log of the users to predict for.
pred_events_data = pd.read_csv("https://stepik.org/media/attachments/course/4852/events_data_test.csv")

In [None]:
# Submission log of the users to predict for.
pred_submission_data = pd.read_csv("https://stepik.org/media/attachments/course/4852/submission_data_test.csv")

In [None]:
# Feature frame for the prediction set, indexed and sorted by user_id.
X_pred = get_x_pred(pred_events_data, pred_submission_data)

In [None]:
# X_pred.sort_index().head()

---

# Random Forest

---

In [None]:
# One scaler instance, passed to every model-building function below.
scaler = StandardScaler()

In [None]:
# Fit the random forest search; train_score/test_score are overwritten by each later model cell.
pipe_rfc, train_score, test_score = rfc(X_train, X_test, y_train, y_test, scaler)

In [None]:
# Results table: one row per classifier with its train and test score.
res_df = pd.DataFrame({"Classifier": ["Random Forest"], "train_score": [train_score], "test_score": [test_score]})

## Random Forest Feature Importance

In [None]:
# pipe_rfc[1] is the fitted search step; take the importances of its best forest,
# label them with the feature names and sort from most to least important.
feature_imp = pd.Series(pipe_rfc[1].best_estimator_.feature_importances_, index=X.columns).sort_values(ascending=False)
# Same data as a two-column frame.
fi_df = pd.DataFrame(feature_imp, columns=["importance"]).reset_index()
# reset_index names the former index column "index" when the index is unnamed.
fi_df = fi_df.rename(columns={"index": "feature"})

In [None]:
# sns.barplot(x=feature_imp, y=feature_imp.index)

---

# Logistic Regression

---

In [None]:
# Fit the logistic regression search; overwrites train_score/test_score.
pipe_lrc, train_score, test_score = lrc(X_train, X_test, y_train, y_test, scaler)

        nan 0.8328284  0.83407168        nan]


In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# old and new pandas versions alike.
res_df = pd.concat(
  [res_df, pd.DataFrame([{"Classifier": "Logistic Regression",
                          "train_score": train_score, "test_score": test_score}])],
  ignore_index=True)

---

# Naive Bayes

---

In [None]:
# Fit the naive Bayes pipeline; overwrites train_score/test_score.
pipe_nbc, train_score, test_score = nbc(X_train, X_test, y_train, y_test, scaler)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# old and new pandas versions alike.
res_df = pd.concat(
  [res_df, pd.DataFrame([{"Classifier": "Naive Bayes",
                          "train_score": train_score, "test_score": test_score}])],
  ignore_index=True)

---

# C-Support Vector Classification

---

In [None]:
# Fit the support vector search; overwrites train_score/test_score.
pipe_svc, train_score, test_score = svc(X_train, X_test, y_train, y_test, scaler)



In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# old and new pandas versions alike.
res_df = pd.concat(
  [res_df, pd.DataFrame([{"Classifier": "Support Vector Classification",
                          "train_score": train_score, "test_score": test_score}])],
  ignore_index=True)

---

# Decision Tree

---

In [None]:
# Fit the decision tree search; overwrites train_score/test_score.
pipe_dtc, train_score, test_score = dtc(X_train, X_test, y_train, y_test, scaler)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# old and new pandas versions alike.
res_df = pd.concat(
  [res_df, pd.DataFrame([{"Classifier": "Decision Tree",
                          "train_score": train_score, "test_score": test_score}])],
  ignore_index=True)

---

# Gradient Boosting

---

In [None]:
# Fit the gradient boosting pipeline; overwrites train_score/test_score.
pipe_gbc, train_score, test_score = gbc(X_train, X_test, y_train, y_test, scaler)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# old and new pandas versions alike.
res_df = pd.concat(
  [res_df, pd.DataFrame([{"Classifier": "Gradient Boosting",
                          "train_score": train_score, "test_score": test_score}])],
  ignore_index=True)

---

# Neural Network

---

In [None]:
# Fit the neural network pipeline; overwrites train_score/test_score.
pipe_nnc, train_score, test_score = nnc(X_train, X_test, y_train, y_test, scaler)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# old and new pandas versions alike.
res_df = pd.concat(
  [res_df, pd.DataFrame([{"Classifier": "Neural Network",
                          "train_score": train_score, "test_score": test_score}])],
  ignore_index=True)

---

# Predictions

---

In [None]:
# Fitted pipelines to predict with and, in the same order, the output file name for each.
# pipe_svc is not part of this list.
pipes = [pipe_gbc, pipe_rfc, pipe_lrc, pipe_nnc, pipe_dtc, pipe_nbc]
labels = ["pipe_gbc", "pipe_rfc", "pipe_lrc", "pipe_nnc", "pipe_dtc", "pipe_nbc"]

In [None]:
# Write one file per model: user_id plus the second predict_proba column.
# Iterating the two lists in lockstep avoids manual index bookkeeping.
# NOTE(review): the file names are taken from ``labels`` verbatim, so they
# carry no ".csv" extension unless the labels include one.
for pipe, label in zip(pipes, labels):
  y_pred = pipe.predict_proba(X_pred)
  save_df = pd.DataFrame({"user_id": X_pred.index, "is_gone": y_pred[:, 1]})
  save_df.to_csv(label, index=False)

Stepic testing
  * DTC - 0.8188757872707935
  * GBC - 0.8859789381103069
  * LRC - 0.8768111830197085
  * NBC - 0.8699781774801457
  * NNC - 0.7761337425604662
  * RFC - 0.8663461009091422

Лучший достигнутый результат получен при использовании Gradient Boosting (Top 30).