In [248]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# import train-test split 
from sklearn.model_selection import train_test_split

# import functions to perform feature selection
#from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.feature_selection import RFE
from mlxtend .feature_selection import SequentialFeatureSelector as sfs

# import functions to perform cross validation
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

#Plotting Parameters
plt.rcParams['figure.figsize'] = [15,8]

In [249]:
train = pd.read_csv("C:/Users/91899/Desktop/Dockship.io/Exam Mark Prediction/exam_mark_prediction_ai_challenge-dataset/train.csv", index_col="Unnamed: 0")
test = pd.read_csv("C:/Users/91899/Desktop/Dockship.io/Exam Mark Prediction/exam_mark_prediction_ai_challenge-dataset/test.csv", index_col="Unnamed: 0")

In [250]:
train.head()

Unnamed: 0,gender,ethnicity,parental level of education,lunch,test preparation course,reading score,writing score,math score
0,male,group C,some college,standard,none,61,62,61
1,female,group C,associate's degree,standard,none,62,53,53
2,female,group C,some college,free/reduced,completed,75,70,67
3,male,group C,some high school,free/reduced,none,76,65,79
4,male,group A,high school,free/reduced,none,58,44,53


In [251]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 699
Data columns (total 8 columns):
gender                         700 non-null object
ethnicity                      700 non-null object
parental level of education    700 non-null object
lunch                          700 non-null object
test preparation course        700 non-null object
reading score                  700 non-null int64
writing score                  700 non-null int64
math score                     700 non-null int64
dtypes: int64(3), object(5)
memory usage: 49.2+ KB


In [252]:
train.describe()

Unnamed: 0,reading score,writing score,math score
count,700.0,700.0,700.0
mean,69.751429,68.497143,66.442857
std,14.600877,15.224265,15.249227
min,23.0,15.0,8.0
25%,60.0,58.0,57.0
50%,70.0,70.0,66.5
75%,80.0,79.0,77.0
max,100.0,100.0,100.0


In [253]:
train.isnull().sum()

gender                         0
ethnicity                      0
parental level of education    0
lunch                          0
test preparation course        0
reading score                  0
writing score                  0
math score                     0
dtype: int64

In [254]:
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
print(cat_cols)

['gender', 'ethnicity', 'parental level of education', 'lunch', 'test preparation course']


In [255]:
def value_count():
    for key, value in train.iteritems():
        print("Column Name: ",key)
        print(value.value_counts()) 
        print("-------------------------------------------") 

In [256]:
value_count()

Column Name:  gender
female    367
male      333
Name: gender, dtype: int64
-------------------------------------------
Column Name:  ethnicity
group C    219
group D    184
group B    141
group E     98
group A     58
Name: ethnicity, dtype: int64
-------------------------------------------
Column Name:  parental level of education
some college          175
associate's degree    138
high school           134
some high school      129
bachelor's degree      81
master's degree        43
Name: parental level of education, dtype: int64
-------------------------------------------
Column Name:  lunch
standard        454
free/reduced    246
Name: lunch, dtype: int64
-------------------------------------------
Column Name:  test preparation course
none         456
completed    244
Name: test preparation course, dtype: int64
-------------------------------------------
Column Name:  reading score
72    26
64    24
66    23
84    22
74    22
      ..
37     1
32     1
29     1
28     1
23     1


In [257]:
train["reading score"].unique()

array([ 61,  62,  75,  76,  58,  82,  51,  71,  52,  67,  96,  65,  70,
        89,  53,  42,  77,  72,  60,  95,  66,  68,  49,  78,  92,  80,
        93,  81,  85,  94,  84,  39,  83,  46,  79,  55,  59,  64,  57,
        97,  86,  44,  90,  54,  99,  74,  56,  63,  32,  45,  87, 100,
        48,  24,  73,  47,  69,  28,  91,  41,  43,  34,  50,  88,  38,
        37,  31,  29,  23,  40], dtype=int64)

In [258]:
train.head()

Unnamed: 0,gender,ethnicity,parental level of education,lunch,test preparation course,reading score,writing score,math score
0,male,group C,some college,standard,none,61,62,61
1,female,group C,associate's degree,standard,none,62,53,53
2,female,group C,some college,free/reduced,completed,75,70,67
3,male,group C,some high school,free/reduced,none,76,65,79
4,male,group A,high school,free/reduced,none,58,44,53


In [259]:
college = "some college"
degree = "associate's degree"
high = "some high school"
def education(x):
    if(x in college):
        return("Junior College")
    elif(x in degree):
        return("bachelor's degree")
    elif(x in high):
        return("high school")
    
train["Education"] = train["parental level of education"].apply(education)

In [260]:
test["Education"] = test["parental level of education"].apply(education)

In [261]:
train["Education"].value_counts()

high school          263
Junior College       175
bachelor's degree    138
Name: Education, dtype: int64

In [262]:
train.drop(["parental level of education", "reading score", "writing score"], axis = 1, inplace = True)

In [263]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
train["ethnicity"] = lb.fit_transform(train["ethnicity"])

In [264]:
test["ethnicity"] = lb.fit_transform(test["ethnicity"])

In [265]:
train["gender"] = lb.fit_transform(train["gender"])
test["gender"] = lb.fit_transform(test["gender"])

In [266]:
train["Education"] = lb.fit_transform(train["Education"])
test["Education"] = lb.fit_transform(test["Education"])

In [267]:
train["lunch"] = lb.fit_transform(train["lunch"])
test["lunch"] = lb.fit_transform(test["lunch"])

In [268]:
train["test preparation course"] = lb.fit_transform(train["test preparation course"])
test["test preparation course"] = lb.fit_transform(test["test preparation course"])

In [269]:
train.head()

Unnamed: 0,gender,ethnicity,lunch,test preparation course,math score,Education
0,1,2,1,1,61,0
1,0,2,1,1,53,1
2,0,2,0,0,67,0
3,1,2,0,1,79,2
4,1,0,0,1,53,2


In [270]:
from sklearn.preprocessing import PowerTransformer

# power transform the raw data
# power = PowerTransformer(method='yeo-johnson', standardize=True)
# train["reading score"] = power.fit_transform(train[["reading score"]])
# test["reading score"] = power.fit_transform(test[["reading score"]])

In [271]:
train["writing score"] = power.fit_transform(train[["writing score"]])
test["writing score"] = power.fit_transform(test[["writing score"]])

KeyError: "None of [Index(['writing score'], dtype='object')] are in the [columns]"

In [272]:
train.head()

Unnamed: 0,gender,ethnicity,lunch,test preparation course,math score,Education
0,1,2,1,1,61,0
1,0,2,1,1,53,1
2,0,2,0,0,67,0
3,1,2,0,1,79,2
4,1,0,0,1,53,2


In [273]:
X = train.drop("math score", axis = 1)
y = train["math score"]

In [274]:
from sklearn.preprocessing import PowerTransformer
y = y.values.reshape(-1,1)
# power transform the raw data
power = PowerTransformer(method='yeo-johnson', standardize=True)
y = power.fit_transform(y)

In [275]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20)

In [276]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [277]:
print("Accuracy on train data: ",lr.score(X_train, y_train))
print("Accuracy on test data: ",lr.score(X_test, y_test))

Accuracy on train data:  0.23562875008255224
Accuracy on test data:  0.2511599803234317


In [278]:
from sklearn.metrics import mean_squared_error
print("RMSE on test data: ", np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE on test data:  0.8713786411847373


In [279]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [280]:
rf.score(X_test, y_test)

0.08139490268455207

In [281]:
print("RMSE on test data: ", np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE on test data:  0.9651099942245421


In [243]:
test = test.drop("parental level of education", axis = 1)

In [244]:
#Regressions
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [245]:
et = ExtraTreesRegressor(n_jobs=-1,n_estimators=500)
rf = RandomForestRegressor(max_depth=8,bootstrap=True,n_jobs=-1,n_estimators=200,max_features='sqrt')
xgb = xgboost.XGBRegressor(n_jobs=-1,colsample_bytree=0.7, learning_rate=0.07,
                           max_depth=5, min_child_weight=4, n_estimators=500,nthread=4, subsample=0.7)
gbr = GradientBoostingRegressor(random_state=0)
xgb = xgboost.XGBRegressor(n_jobs=-1)
et = ExtraTreesRegressor(n_jobs=-1)
ds = DecisionTreeRegressor()
catboost_reg = CatBoostRegressor()

In [246]:
pipelines = {
"LinearRegression": LinearRegression(),
"AdaBoostRegressor":AdaBoostRegressor(random_state=0, n_estimators=100),
"ElasticNet":ElasticNet(random_state=0),
"GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
"DecisionTreeRegressor": DecisionTreeRegressor(),
"ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
"RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
"XGBRegressor":xgboost.XGBRegressor(n_jobs=-1),
"LGBM": LGBMRegressor(boosting_type='rf',num_leaves=31,max_depth=-1,learning_rate=0.1,n_estimators=150,subsample_for_bin=200000),
"CatBoostRegressor":CatBoostRegressor(iterations=900, depth=5, learning_rate=0.05,loss_function = 'Logloss')
}

In [247]:
dic =  {"Model":[],"R2_Train":[],"RMSE_Train":[]}
for name, model in pipelines.items():
    if name == 'CatBoostRegressor':
        model.fit(X_train, y_train,verbose=False)
    else:
        model.fit(X_train, y_train)
    #y_test_pre = model.predict(test_df)
    predict_train = model.predict(X_train)
    predict_test  = model.predict(X_test)
    print("--------------------------------------------------------------")
    print("Model:", name)
    print("-----Training Data Evalution-----")
    print("Train Score:",model.score(X_train, y_train))
    print("Test Score:",model.score(X_test, y_test))
    print('RMSE on train data: ', np.sqrt(mean_squared_error(y_train, predict_train)))
    print('RMSE on test data: ',  np.sqrt(mean_squared_error(y_test, predict_test)))
    dic["Model"].append(name)

--------------------------------------------------------------
Model: LinearRegression
-----Training Data Evalution-----
Train Score: 0.5888521410639406
Test Score: 0.5556341755092785
RMSE on train data:  0.6336407807839352
RMSE on test data:  0.6960819969056924
--------------------------------------------------------------
Model: AdaBoostRegressor
-----Training Data Evalution-----
Train Score: 0.6792002436475412
Test Score: 0.636514856824457
RMSE on train data:  0.5597074902003877
RMSE on test data:  0.6295546100570265
--------------------------------------------------------------
Model: ElasticNet
-----Training Data Evalution-----
Train Score: 0.11055189460863912
Test Score: 0.08252144487159896
RMSE on train data:  0.931975318583273
RMSE on test data:  1.0002027337450183
--------------------------------------------------------------
Model: GradientBoostingRegressor
-----Training Data Evalution-----
Train Score: 0.7428981942202884
Test Score: 0.665257969424129
RMSE on train data:  0.5

TypeError: Wrong type(ndarray) for label.
It should be list, numpy 1-D array or pandas Series

In [113]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

In [115]:
print("Accuracy on train data: ",gb.score(X_train, y_train))
print("Accuracy on test data: ",gb.score(X_test, y_test))

Accuracy on train data:  0.9194943408364229
Accuracy on test data:  0.8814629624331839


In [116]:
print("RMSE on train data: ", np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE on train data:  0.3567736264182878


In [174]:
import pickle
# open a file, where you ant to store the data
file = open('LinearReg.pkl', 'wb')

# dump information to that file
pickle.dump(lr, file)

In [175]:
result = lr.predict(test)

In [176]:
results = pd.DataFrame(data=result)
results.head()

Unnamed: 0,0
0,-0.057769
1,0.194762
2,1.575242
3,-0.628731
4,0.426013


In [177]:
a_inverse_transformed = power.inverse_transform(results)

In [178]:
res = pd.DataFrame(data=a_inverse_transformed, columns={'Math score': '0'})
res.head()

Unnamed: 0,Math score
0,66.107683
1,69.878541
2,89.554126
3,57.333341
4,73.279541


In [179]:
res.to_csv("LinearReg.csv")