In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold  
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
import xgboost as xgb

#### 读取数据

In [6]:
studentInfo = pd.read_csv('data_1.csv') # change the file-name suffix to match the data file being loaded
studentInfo.head()

Unnamed: 0.1,Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration,module_presentation_length,number_TMA,number_CMA,number_Exam,number_all
0,0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,,268,5.0,0.0,1.0,6.0
1,1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,,268,5.0,0.0,1.0,6.0
2,2,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,-52.0,,268,5.0,0.0,1.0,6.0
3,3,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,,268,5.0,0.0,1.0,6.0
4,4,AAA,2013J,38053,M,Wales,A Level or Equivalent,80-90%,35-55,0,60,N,Pass,-110.0,,268,5.0,0.0,1.0,6.0


In [3]:
def _ordinalCodes(column):
    """Map each distinct non-null value of `column` to its 1-based rank in sorted order."""
    ordered = np.sort(column.dropna().unique())
    return dict(zip(ordered, range(1, len(ordered) + 1)))


def processData(studentInfo):
    """Turn the raw student table into an integer-encoded feature frame and a binary label.

    Steps, in order:
      1. Keep only rows whose ``final_result`` is 'Pass' or 'Fail'.
      2. Replace ``date_unregistration`` with a 0/1 flag (1 = a date is present).
      3. Drop rows without ``imd_band``.
      4. Encode the categorical columns as integers and add the derived
         ``season`` and ``have_try`` columns.
      5. Drop every row that still contains a missing value.
      6. Relabel 'Pass' rows that carry an unregistration date as 0.

    Parameters
    ----------
    studentInfo : pandas.DataFrame
        Raw table; it is not modified.

    Returns
    -------
    (studentX, studentY)
        ``studentX`` is the processed frame without ``id_student`` and
        ``final_result``; ``studentY`` is the ``final_result`` Series
        (1 = Pass, 0 = Fail). ``studentY`` is a Series even for a single row.
    """
    # Explicit copy: every assignment below writes to our own frame, never to a
    # view of the caller's data.
    passAndFailMask = studentInfo['final_result'].isin(['Pass', 'Fail'])
    infoData = studentInfo.loc[passAndFailMask].copy()

    # Flag "has an unregistration date" directly from missingness. Using 0 as a
    # fill sentinel would misclassify a student who unregistered on day 0.
    # Kept as float so the column dtype matches the raw column.
    infoData['date_unregistration'] = infoData['date_unregistration'].notna().astype('float64')

    # Must happen before the data-derived mappings are built, so that they only
    # see the values of rows that still have an imd_band.
    infoData = infoData.dropna(subset=['imd_band'])

    education_mapping = {"HE Qualification": 1, "A Level or Equivalent": 2, "Lower Than A Level": 3,
                         "Post Graduate Qualification": 4, "No Formal quals": 5}
    gender_mapping = {'M': 0, 'F': 1}
    age_mapping = {'0-35': 1, '35-55': 2, '55<=': 3}
    disability_mapping = {'N': 0, 'Y': 1}
    module_mapping = {'AAA': 1, 'BBB': 2, 'CCC': 3, 'DDD': 4, 'EEE': 5, 'FFF': 6, 'GGG': 7}
    season_mapping = {'2013J': 1, '2014J': 1, '2013B': 2, '2014B': 2}
    presentation_mapping = {'2013J': 1, '2014J': 2, '2013B': 3, '2014B': 4}
    final_result_mapping = {'Pass': 1, 'Fail': 0}
    # Data-derived encodings: rank of each observed value in sorted order.
    imd_mapping = _ordinalCodes(infoData['imd_band'])
    region_mapping = _ordinalCodes(infoData['region'])
    length_mapping = _ordinalCodes(infoData['module_presentation_length'])

    # season must be derived before code_presentation is overwritten below.
    infoData['season'] = infoData['code_presentation'].map(season_mapping)
    infoData['have_try'] = (infoData['num_of_prev_attempts'] > 0).astype('uint8')

    column_mappings = {
        'gender': gender_mapping,
        'highest_education': education_mapping,
        'age_band': age_mapping,
        'disability': disability_mapping,
        'code_module': module_mapping,
        'code_presentation': presentation_mapping,
        'imd_band': imd_mapping,
        'final_result': final_result_mapping,
        'region': region_mapping,
        'module_presentation_length': length_mapping,
    }
    for name, mapping in column_mappings.items():
        infoData[name] = infoData[name].map(mapping)

    # Values absent from a mapping became NaN above; those rows are discarded
    # together with rows that were already incomplete.
    infoData = infoData.dropna()

    # A 'Pass' row that also has an unregistration date is counted as a fail.
    unregisteredPass = (infoData['final_result'] == 1) & (infoData['date_unregistration'] == 1)
    infoData.loc[unregisteredPass, 'final_result'] = 0

    # NOTE(review): date_unregistration stays in studentX although every row with
    # the flag set now has label 0, so the feature reveals part of the target.
    # NOTE(review): any leftover index columns in the input (e.g. 'Unnamed: 0')
    # are passed through as features — confirm that is intended.
    studentX = infoData.drop(['id_student', 'final_result'], axis=1)
    # Direct column selection always yields a Series; DataFrame.squeeze() would
    # collapse a one-row result to a scalar.
    studentY = infoData['final_result']
    return studentX, studentY

In [4]:
studentX, studentY = processData(studentInfo)
# Fixed seed: with shuffle=True and no random_state the folds differ on every
# run, so the accuracies printed by the evaluators could not be reproduced.
stkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def DecisionTree(studentX, studentY, random_state=42):
    """Print the mean cross-validated accuracy of a depth-1 decision tree.

    The folds come from the module-level ``stkfold`` splitter. A fresh
    classifier is trained on each training split and scored on the held-out
    split.

    Parameters
    ----------
    studentX : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    studentY : pandas.Series
        Labels aligned with ``studentX``.
    random_state : int, optional
        Seed passed to the classifier. ``splitter='random'`` draws random split
        thresholds, so without a seed the score changes between runs.
    """
    accList = []
    print('Decision Tree -----> ', end="")
    for train_index, test_index in stkfold.split(studentX, studentY):
        Xtrain, ytrain = studentX.iloc[train_index], studentY.iloc[train_index]
        Xtest, ytest = studentX.iloc[test_index], studentY.iloc[test_index]

        model = DecisionTreeClassifier(criterion='gini', splitter='random', class_weight='balanced',
                                       max_depth=1, min_impurity_decrease=0.00404040404040404,
                                       random_state=random_state)
        model.fit(Xtrain, ytrain)
        ypred = model.predict(Xtest)
        # accuracy_score is documented as (y_true, y_pred).
        accList.append(metrics.accuracy_score(ytest, ypred))

    print(np.mean(accList))


def RandomForest(studentX, studentY, random_state=42):
    """Print the mean cross-validated accuracy of a 50-tree random forest.

    The folds come from the module-level ``stkfold`` splitter. A fresh forest
    is trained on each training split and scored on the held-out split.

    Parameters
    ----------
    studentX : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    studentY : pandas.Series
        Labels aligned with ``studentX``.
    random_state : int, optional
        Seed passed to the forest so bootstrap and feature sampling are
        repeatable between runs.
    """
    accList = []
    print('Random Forest -----> ', end="")
    for train_index, test_index in stkfold.split(studentX, studentY):
        Xtrain, ytrain = studentX.iloc[train_index], studentY.iloc[train_index]
        Xtest, ytest = studentX.iloc[test_index], studentY.iloc[test_index]

        model = RandomForestClassifier(n_estimators=50, max_depth=12, min_impurity_decrease=0,
                                       random_state=random_state)
        model.fit(Xtrain, ytrain)
        ypred = model.predict(Xtest)
        # accuracy_score is documented as (y_true, y_pred).
        accList.append(metrics.accuracy_score(ytest, ypred))
    print(np.mean(accList))

def Logistic(studentX, studentY):
    """Print the mean cross-validated accuracy of an L2-regularised logistic regression.

    Each split produced by the module-level ``stkfold`` trains a new model on
    the training rows and scores it on the held-out rows; the average of the
    per-fold accuracies is printed after the label.
    """
    print('Logistic Regression -----> ',end="")
    foldScores = []
    for fitRows, evalRows in stkfold.split(studentX,studentY):
        classifier = LogisticRegression(
            penalty='l2',
            C=0.01,
            solver='lbfgs',
            multi_class='ovr',
            max_iter=3000,
        )
        classifier.fit(studentX.iloc[fitRows], studentY.iloc[fitRows])
        predicted = classifier.predict(studentX.iloc[evalRows])
        foldScores.append(metrics.accuracy_score(predicted, studentY.iloc[evalRows]))
    print(np.mean(foldScores))

def ExtraTree(studentX, studentY, random_state=42):
    """Print the mean cross-validated accuracy of a 20-tree extra-trees ensemble.

    The folds come from the module-level ``stkfold`` splitter. A fresh ensemble
    is trained on each training split and scored on the held-out split.

    Parameters
    ----------
    studentX : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    studentY : pandas.Series
        Labels aligned with ``studentX``.
    random_state : int, optional
        Seed passed to the ensemble so its randomised splits are repeatable
        between runs.
    """
    accList = []
    print('ExtraTree -----> ', end="")
    for train_index, test_index in stkfold.split(studentX, studentY):
        Xtrain, ytrain = studentX.iloc[train_index], studentY.iloc[train_index]
        Xtest, ytest = studentX.iloc[test_index], studentY.iloc[test_index]

        model = ExtraTreesClassifier(n_estimators=20, max_depth=10, min_impurity_decrease=0,
                                     random_state=random_state)
        model.fit(Xtrain, ytrain)
        ypred = model.predict(Xtest)
        # accuracy_score is documented as (y_true, y_pred).
        accList.append(metrics.accuracy_score(ytest, ypred))
    print(np.mean(accList))
    
def Xgboost(studentX, studentY):
    """Print the mean cross-validated accuracy of an XGBoost classifier.

    The folds come from the module-level ``stkfold`` splitter. A fresh booster
    (28 trees, depth 6, learning rate 0.06) is trained on each training split
    and scored on the held-out split.

    Parameters
    ----------
    studentX : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    studentY : pandas.Series
        Labels aligned with ``studentX``.
    """
    accList = []
    print('Xgboost -----> ', end="")
    for train_index, test_index in stkfold.split(studentX, studentY):
        Xtrain, ytrain = studentX.iloc[train_index], studentY.iloc[train_index]
        Xtest, ytest = studentX.iloc[test_index], studentY.iloc[test_index]

        # `silent` and `nthread` are deprecated constructor arguments;
        # `verbosity=0` and `n_jobs=-1` are their documented replacements.
        model = xgb.XGBClassifier(n_estimators=28, max_depth=6, learning_rate=0.06,
                                  n_jobs=-1, verbosity=0)
        model.fit(Xtrain, ytrain)
        ypred = model.predict(Xtest)
        # accuracy_score is documented as (y_true, y_pred).
        accList.append(metrics.accuracy_score(ytest, ypred))
    print(np.mean(accList))
    
def catboost(studentX, studentY):
    """Print the mean cross-validated accuracy of a CatBoost classifier.

    The folds come from the module-level ``stkfold`` splitter. Every column of
    ``studentX`` is declared categorical. For each fold one model is trained
    with the held-out split as ``eval_set`` and ``use_best_model=True``, then
    scored on that same held-out split.

    Parameters
    ----------
    studentX : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    studentY : pandas.Series
        Labels aligned with ``studentX``.
    """
    # NOTE(review): all columns are treated as categorical, including any
    # float-valued ones; CatBoost documents categorical values as integers or
    # strings — confirm this is accepted by the installed version.
    categorical_features_indices = np.arange(0, len(studentX.columns))
    params = {
        'iterations': 100,
        'learning_rate': 0.08,
        'eval_metric': 'Accuracy',
        'random_seed': 42,
        'logging_level': 'Silent',
        'use_best_model': True
    }
    accList = []
    print('Catboost -----> ', end="")
    for train_index, test_index in stkfold.split(studentX, studentY):
        Xtrain, ytrain = studentX.iloc[train_index], studentY.iloc[train_index]
        Xtest, ytest = studentX.iloc[test_index], studentY.iloc[test_index]

        train_pool = Pool(Xtrain, ytrain, cat_features=categorical_features_indices)
        validate_pool = Pool(Xtest, ytest, cat_features=categorical_features_indices)

        # Only the use_best_model=True model was ever scored; the extra
        # use_best_model=False fit that used to precede it was discarded, so it
        # is no longer trained.
        # NOTE(review): the best iteration is chosen on the same fold that is
        # scored below, which presumably biases the reported accuracy upwards;
        # consider a separate validation split.
        best_model = CatBoostClassifier(**params)
        best_model.fit(train_pool, eval_set=validate_pool)
        accList.append(metrics.accuracy_score(ytest, best_model.predict(Xtest)))
    print(np.mean(accList))

In [5]:
# Score every model on the same features and labels, in a fixed order.
for evaluate in (DecisionTree, RandomForest, Logistic, ExtraTree, Xgboost, catboost):
    evaluate(studentX, studentY)

Decision Tree -----> 0.6485418842730922
Random Forest -----> 0.6908514158096484
Logistic Regression -----> 0.6876973086569196
ExtraTree -----> 0.6871405500917493
Xgboost -----> 0.6923355705221237
Catboost -----> 0.6935425727962558
