##Binary Classification Project model

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [70]:
df = pd.read_csv('edx.csv', encoding='ISO-8859-1', low_memory=False)


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641138 entries, 0 to 641137
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   course_id           641138 non-null  object 
 1   Course Short Title  471517 non-null  object 
 2   Course Long Title   641138 non-null  object 
 3   userid_DI           641138 non-null  object 
 4   registered          641138 non-null  int64  
 5   viewed              641138 non-null  int64  
 6   explored            641138 non-null  int64  
 7   certified           641138 non-null  int64  
 8   Country             641138 non-null  object 
 9   LoE_DI              535130 non-null  object 
 10  YoB                 544533 non-null  float64
 11  Age                 641138 non-null  object 
 12  gender              554332 non-null  object 
 13  grade               592766 non-null  object 
 14  start_time_DI       641138 non-null  object 
 15  last_event_DI       462184 non-nul

In [72]:
df.LoE_DI.isnull().sum()

106008

Preprocessing


In [73]:
#fill na level of education value with unknown
df['LoE_DI'] =  df['LoE_DI'].fillna("Unknown")
df.LoE_DI.value_counts()

Unnamed: 0_level_0,count
LoE_DI,Unnamed: 1_level_1
Bachelor's,219768
Secondary,169694
Master's,118189
Unknown,106008
Less than Secondary,14092
Doctorate,13387


In [74]:
#fix grade null values
df['grade'] = pd.to_numeric(df['grade'], errors='coerce')

df['grade'] = df['grade'].astype(float)


df['grade'] = df['grade'].fillna(0)



In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641138 entries, 0 to 641137
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   course_id           641138 non-null  object 
 1   Course Short Title  471517 non-null  object 
 2   Course Long Title   641138 non-null  object 
 3   userid_DI           641138 non-null  object 
 4   registered          641138 non-null  int64  
 5   viewed              641138 non-null  int64  
 6   explored            641138 non-null  int64  
 7   certified           641138 non-null  int64  
 8   Country             641138 non-null  object 
 9   LoE_DI              641138 non-null  object 
 10  YoB                 544533 non-null  float64
 11  Age                 641138 non-null  object 
 12  gender              554332 non-null  object 
 13  grade               641138 non-null  float64
 14  start_time_DI       641138 non-null  object 
 15  last_event_DI       462184 non-nul

In [76]:
#fix event, videos, chapters columns
# nan values assume no progess or no chapters/videos done so thats why they are filled to 0
df[['nevents','ndays_act','nplay_video','nchapters']] = df[['nevents','ndays_act','nplay_video','nchapters']].fillna(0)

In [77]:
#fix genders

df['gender'] = df['gender'].fillna("not specified")

In [78]:
df.head()

Unnamed: 0,course_id,Course Short Title,Course Long Title,userid_DI,registered,viewed,explored,certified,Country,LoE_DI,...,grade,start_time_DI,last_event_DI,nevents,ndays_act,nplay_video,nchapters,nforum_posts,roles,incomplete_flag
0,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130442623,1,0,0,0,United States,Unknown,...,0.0,19/12/2012,17/11/2013,0.0,9.0,0.0,0.0,0,,1.0
1,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130275857,1,0,0,0,United States,Unknown,...,0.0,08/02/2013,17/11/2013,0.0,16.0,0.0,0.0,0,,1.0
2,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130539455,1,1,0,0,France,Unknown,...,0.0,01/01/2013,14/05/2013,42.0,6.0,0.0,3.0,0,,
3,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130088379,1,1,0,0,United States,Unknown,...,0.0,18/02/2013,17/03/2013,70.0,3.0,0.0,3.0,0,,
4,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130024894,1,1,0,0,United States,Unknown,...,0.07,24/01/2013,03/08/2013,175.0,9.0,0.0,7.0,0,,


In [79]:
#make date time columns have some relevancy

df['engagement_duration'] = (
    pd.to_datetime(df['last_event_DI'], dayfirst=True) -
    pd.to_datetime(df['start_time_DI'], dayfirst=True)
).dt.days

df['engagement_duration'].fillna(0)


Unnamed: 0,engagement_duration
0,333.0
1,282.0
2,133.0
3,27.0
4,191.0
...,...
641133,0.0
641134,0.0
641135,0.0
641136,0.0


In [80]:
df['engagement_duration'] = df['engagement_duration'].fillna(0)

In [81]:
df.head()

Unnamed: 0,course_id,Course Short Title,Course Long Title,userid_DI,registered,viewed,explored,certified,Country,LoE_DI,...,start_time_DI,last_event_DI,nevents,ndays_act,nplay_video,nchapters,nforum_posts,roles,incomplete_flag,engagement_duration
0,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130442623,1,0,0,0,United States,Unknown,...,19/12/2012,17/11/2013,0.0,9.0,0.0,0.0,0,,1.0,333.0
1,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130275857,1,0,0,0,United States,Unknown,...,08/02/2013,17/11/2013,0.0,16.0,0.0,0.0,0,,1.0,282.0
2,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130539455,1,1,0,0,France,Unknown,...,01/01/2013,14/05/2013,42.0,6.0,0.0,3.0,0,,,133.0
3,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130088379,1,1,0,0,United States,Unknown,...,18/02/2013,17/03/2013,70.0,3.0,0.0,3.0,0,,,27.0
4,HarvardX/CB22x/2013_Spring,HeroesX,The Ancient Greek Hero,MHxPC130024894,1,1,0,0,United States,Unknown,...,24/01/2013,03/08/2013,175.0,9.0,0.0,7.0,0,,,191.0


Drop columns

In [82]:
df = df.drop(columns=['roles', 'incomplete_flag', 'YoB', 'start_time_DI', 'last_event_DI'])

In [83]:
df = df.drop(columns=['course_id','Course Short Title','userid_DI', 'Course Long Title','Age','Country'])

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641138 entries, 0 to 641137
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   registered           641138 non-null  int64  
 1   viewed               641138 non-null  int64  
 2   explored             641138 non-null  int64  
 3   certified            641138 non-null  int64  
 4   LoE_DI               641138 non-null  object 
 5   gender               641138 non-null  object 
 6   grade                641138 non-null  float64
 7   nevents              641138 non-null  float64
 8   ndays_act            641138 non-null  float64
 9   nplay_video          641138 non-null  float64
 10  nchapters            641138 non-null  float64
 11  nforum_posts         641138 non-null  int64  
 12  engagement_duration  641138 non-null  float64
dtypes: float64(6), int64(5), object(2)
memory usage: 63.6+ MB


create dummy variables

In [85]:
df = pd.get_dummies(df, columns=['LoE_DI', 'gender'])

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641138 entries, 0 to 641137
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   registered                  641138 non-null  int64  
 1   viewed                      641138 non-null  int64  
 2   explored                    641138 non-null  int64  
 3   certified                   641138 non-null  int64  
 4   grade                       641138 non-null  float64
 5   nevents                     641138 non-null  float64
 6   ndays_act                   641138 non-null  float64
 7   nplay_video                 641138 non-null  float64
 8   nchapters                   641138 non-null  float64
 9   nforum_posts                641138 non-null  int64  
 10  engagement_duration         641138 non-null  float64
 11  LoE_DI_Bachelor's           641138 non-null  bool   
 12  LoE_DI_Doctorate            641138 non-null  bool   
 13  LoE_DI_Less th

Create train and test set

In [87]:

X = df.drop(columns=['certified'])
y = df['certified']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Model Building

In [88]:
#Random Forest Classifier model
rfc_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

In [89]:
#accuracy of random forest
accuracy_score(y_test, rfc_pred)

0.9988380072994978

In [90]:
#XGBoost Classifier model
xgb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [91]:
#accuracy of xgboost
accuracy_score(y_test, xgb_pred)

0.9987444239947594

In [92]:
#Logistic Regression Classifier model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [93]:
#accuracy of logistic regression model
accuracy_score(y_test, lr_pred)

0.9850656642854915