# Machine learning

### Loading Libraries

In [4]:
import plotly.express as px
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
# Import label encoder

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import linear_model
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings('ignore')

#### Loading data

In [5]:
# Read the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(r"../data/hsmsx.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,conduct,age,gender,nationality,pob,tutor/makeup,grade,admission,guardian,time_with_parents,...,physics,gpa,literature,biology,geography,history,civics,ent-ship,matrik,level
0,a,10.0,f,ethiopian,ethiopia,yes,5,paid,both,a lot,...,,89.4,,,,,,,,middleschool
1,b,12.0,m,ethiopian,ethiopia,no,5,scholarship,both,little,...,,81.9,,,,,,,,middleschool
2,a,10.0,m,ethiopian,ethiopia,yes,5,paid,both,a lot,...,,87.9,,,,,,,,middleschool
3,c,11.0,m,ethiopian,ethiopia,no,5,paid,both,a lot,...,,85.9,,,,,,,,middleschool
4,b,11.0,f,ethiopian,foriegn_soil,no,5,paid,mother,a lot,...,,95.5,,,,,,,,middleschool


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 49 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   conduct                     287 non-null    object 
 1   age                         287 non-null    float64
 2   gender                      287 non-null    object 
 3   nationality                 287 non-null    object 
 4   pob                         287 non-null    object 
 5   tutor/makeup                287 non-null    object 
 6   grade                       287 non-null    int64  
 7   admission                   287 non-null    object 
 8   guardian                    287 non-null    object 
 9   time_with_parents           287 non-null    object 
 10  communication_with_parents  287 non-null    object 
 11  reward_for_achievements     287 non-null    object 
 12  parent_understanding        287 non-null    object 
 13  transport_type              287 non

In [7]:
df.columns

Index(['conduct', 'age', 'gender', 'nationality', 'pob', 'tutor/makeup',
       'grade', 'admission', 'guardian', 'time_with_parents',
       'communication_with_parents', 'reward_for_achievements',
       'parent_understanding', 'transport_type', 'residence', 'travel time',
       'birth_order', 'exercice', 'previous_school', 'mother_tongue',
       'curriculum', 'sleeping time_in_pm', 'parenting_style', 'trauma',
       'type', 'quality', 'english', 'amharic', 'arabic', 'geez', 'ict',
       'maths', 'ss', 'gp', 'is', 'moral', 'hpe', 'art', 'chemistry',
       'physics', 'gpa', 'literature', 'biology', 'geography', 'history',
       'civics', 'ent-ship', 'matrik', 'level'],
      dtype='object')

In [8]:
# Candidate features: conduct, age, gender, pob, tutor/makeup, grade, admission, guardian, p4, transport_type,
# birth_order, previous_school, mother_tongue, parenting_style, quality, maths, english



#combination
#feature
#admission

In [9]:
# Keep only the columns used for modelling: 14 candidate predictors plus the `gpa` target.
dfx = df[['conduct', 'age', 'gender', 'pob', 'tutor/makeup', 'grade', 'admission', 'guardian', 'transport_type', 
          'birth_order', 'mother_tongue', 'parenting_style', 'maths', 'english', 'gpa'
          ]]

In [10]:
# Survey the value frequencies of each selected column; the trailing comments record the
# encoding planned for that column. Only the last expression is shown as the cell output.
dfx.conduct.value_counts()  # a / b / c -> label (ordinal) encoding
dfx.age.value_counts()      # numeric (float64 in df.info())
dfx.gender.value_counts()   # f and m -> one-hot encoding
dfx.pob.value_counts()      # ethiopia or foriegn_soil -> one-hot encoding
dfx['tutor/makeup'].value_counts()   # yes or no
dfx.grade.value_counts()     # grades 5-12
dfx.admission.value_counts() # one-hot encoding
dfx.guardian.value_counts() # both / mother / father / other; mother and father are to be merged into 'single', then one-hot encoded
dfx.transport_type.value_counts() # private, service, public, on foot -> one-hot encoding
dfx.birth_order.value_counts() # first, middle, last child -> one-hot encoding
dfx.mother_tongue.value_counts()  # one-hot encoding
dfx.parenting_style.value_counts() # one-hot encoding

authoritative    222
authoritarian     52
permissive        12
uninvolved         1
Name: parenting_style, dtype: int64

#### Removing Outliers

In [11]:
# Box plot of the maths scores, used to look for outliers.
fig = px.box(dfx, y="maths")
fig.show()

In [12]:
dfx[dfx.maths == 27.53]

Unnamed: 0,conduct,age,gender,pob,tutor/makeup,grade,admission,guardian,transport_type,birth_order,mother_tongue,parenting_style,maths,english,gpa
281,b,18.0,f,ethiopia,no,12,paid,mother,private,last child,for-lang,authoritarian,27.53,67.67,63.7


In [13]:
dfx.loc[281]

conduct                        b
age                         18.0
gender                         f
pob                     ethiopia
tutor/makeup                  no
grade                         12
admission                   paid
guardian                  mother
transport_type           private
birth_order           last child
mother_tongue           for-lang
parenting_style    authoritarian
maths                      27.53
english                    67.67
gpa                         63.7
Name: 281, dtype: object

In [14]:
# Remove the maths outlier inspected above (row label 281, maths == 27.53).
# errors="ignore" keeps this cell idempotent: re-running it after the row is
# already gone is a no-op instead of raising KeyError.
dfx = dfx.drop(index=281, errors="ignore")

In [15]:
fig = px.box(dfx, y="age")
fig.show()

In [16]:
dfx.maths.min()

58.36

In [17]:
dfx.isna().sum()

conduct            0
age                0
gender             0
pob                0
tutor/makeup       0
grade              0
admission          0
guardian           0
transport_type     0
birth_order        0
mother_tongue      0
parenting_style    0
maths              0
english            0
gpa                0
dtype: int64

In [18]:
# One-hot encode birth order.
# NOTE(review): the dummies are built from `df` (all 287 rows) rather than `dfx`, from
# which row 281 has already been dropped. Concatenating on the index therefore brings
# row 281 back as a row of NaNs in the dfx columns; a later cell drops it again.
dummies = pd.get_dummies(df.birth_order)

In [19]:
# Append the birth-order indicator columns, aligned on the row index.
merged = pd.concat([dfx, dummies], axis = 'columns')
# The original categorical column is now redundant. All three indicator columns are
# kept here (no baseline category is dropped for birth order).
merged = merged.drop(['birth_order'], axis='columns')
merged

Unnamed: 0,conduct,age,gender,pob,tutor/makeup,grade,admission,guardian,transport_type,mother_tongue,parenting_style,maths,english,gpa,first child,last child,middle child
0,a,10.0,f,ethiopia,yes,5.0,paid,both,private,eth-lang,authoritative,76.38,94.86,89.4,0,0,1
1,b,12.0,m,ethiopia,no,5.0,scholarship,both,public,eth-lang,authoritative,74.92,74.12,81.9,1,0,0
2,a,10.0,m,ethiopia,yes,5.0,paid,both,private,eth-lang,authoritative,84.33,80.12,87.9,0,1,0
3,c,11.0,m,ethiopia,no,5.0,paid,both,private,eth-lang,authoritative,90.63,82.67,85.9,0,1,0
4,b,11.0,f,foriegn_soil,no,5.0,paid,mother,private,for-lang,authoritative,96.00,96.31,95.5,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,b,18.0,f,ethiopia,no,12.0,paid,both,private,eth-lang,authoritative,66.71,79.77,83.4,0,0,1
284,a,17.0,f,ethiopia,yes,12.0,paid,both,service,eth-lang,authoritative,64.07,91.40,84.2,0,0,1
285,a,18.0,f,ethiopia,yes,12.0,paid,both,private,eth-lang,authoritative,65.91,77.82,82.9,0,1,0
286,a,18.0,m,ethiopia,no,12.0,paid,both,private,eth-lang,authoritative,72.30,86.53,86.6,1,0,0


In [20]:
# One-hot encode parenting style (taken from `df`) and append it, aligned on the row index.
dummies2 = pd.get_dummies(df.parenting_style)
merged2 = pd.concat([merged, dummies2], axis = 'columns')
# Drop the source column and the 'uninvolved' indicator, which leaves 'uninvolved'
# as the implicit baseline category.
merged2 = merged2.drop(['parenting_style', 'uninvolved'], axis='columns')
merged2

Unnamed: 0,conduct,age,gender,pob,tutor/makeup,grade,admission,guardian,transport_type,mother_tongue,maths,english,gpa,first child,last child,middle child,authoritarian,authoritative,permissive
0,a,10.0,f,ethiopia,yes,5.0,paid,both,private,eth-lang,76.38,94.86,89.4,0,0,1,0,1,0
1,b,12.0,m,ethiopia,no,5.0,scholarship,both,public,eth-lang,74.92,74.12,81.9,1,0,0,0,1,0
2,a,10.0,m,ethiopia,yes,5.0,paid,both,private,eth-lang,84.33,80.12,87.9,0,1,0,0,1,0
3,c,11.0,m,ethiopia,no,5.0,paid,both,private,eth-lang,90.63,82.67,85.9,0,1,0,0,1,0
4,b,11.0,f,foriegn_soil,no,5.0,paid,mother,private,for-lang,96.00,96.31,95.5,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,b,18.0,f,ethiopia,no,12.0,paid,both,private,eth-lang,66.71,79.77,83.4,0,0,1,0,1,0
284,a,17.0,f,ethiopia,yes,12.0,paid,both,service,eth-lang,64.07,91.40,84.2,0,0,1,0,1,0
285,a,18.0,f,ethiopia,yes,12.0,paid,both,private,eth-lang,65.91,77.82,82.9,0,1,0,0,1,0
286,a,18.0,m,ethiopia,no,12.0,paid,both,private,eth-lang,72.30,86.53,86.6,1,0,0,0,1,0


In [21]:
df.transport_type.value_counts()

private    170
service     59
public      55
on foot      2
foot         1
Name: transport_type, dtype: int64

In [22]:
# Fold the two spellings of walking into the single category 'foot'.
df['transport_type'] = df['transport_type'].replace({'on foot': 'foot'})
# Treat mother-only and father-only guardianship as one 'single' category.
df['guardian'] = df['guardian'].replace({'mother': 'single', 'father': 'single'})
# Show the resulting guardian distribution.
df['guardian'].value_counts()

both      226
single     45
other      16
Name: guardian, dtype: int64

In [23]:
# One-hot encode transport type; relies on 'on foot' having already been merged
# into 'foot' in `df`, otherwise an extra 'on foot' column would remain.
dummies3 = pd.get_dummies(df.transport_type)
merged3 = pd.concat([merged2, dummies3], axis = 'columns')
# Drop the source column and the 'foot' indicator, leaving 'foot' as the implicit baseline.
merged3 = merged3.drop(['transport_type', 'foot'], axis='columns')
merged3

Unnamed: 0,conduct,age,gender,pob,tutor/makeup,grade,admission,guardian,mother_tongue,maths,...,gpa,first child,last child,middle child,authoritarian,authoritative,permissive,private,public,service
0,a,10.0,f,ethiopia,yes,5.0,paid,both,eth-lang,76.38,...,89.4,0,0,1,0,1,0,1,0,0
1,b,12.0,m,ethiopia,no,5.0,scholarship,both,eth-lang,74.92,...,81.9,1,0,0,0,1,0,0,1,0
2,a,10.0,m,ethiopia,yes,5.0,paid,both,eth-lang,84.33,...,87.9,0,1,0,0,1,0,1,0,0
3,c,11.0,m,ethiopia,no,5.0,paid,both,eth-lang,90.63,...,85.9,0,1,0,0,1,0,1,0,0
4,b,11.0,f,foriegn_soil,no,5.0,paid,mother,for-lang,96.00,...,95.5,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,b,18.0,f,ethiopia,no,12.0,paid,both,eth-lang,66.71,...,83.4,0,0,1,0,1,0,1,0,0
284,a,17.0,f,ethiopia,yes,12.0,paid,both,eth-lang,64.07,...,84.2,0,0,1,0,1,0,0,0,1
285,a,18.0,f,ethiopia,yes,12.0,paid,both,eth-lang,65.91,...,82.9,0,1,0,0,1,0,1,0,0
286,a,18.0,m,ethiopia,no,12.0,paid,both,eth-lang,72.30,...,86.6,1,0,0,0,1,0,1,0,0


In [24]:
# One-hot encode the (already merged) guardian categories taken from `df`.
dummies4 = pd.get_dummies(df.guardian)
merged4 = (
    pd.concat([merged3, dummies4], axis='columns')
    # 'other' is left out so that it acts as the implicit baseline category.
    .drop(columns=['guardian', 'other'])
    # Row 281 is the maths outlier; the dummies built from `df` re-introduced it.
    .drop(index=[281])
)
merged4

Unnamed: 0,conduct,age,gender,pob,tutor/makeup,grade,admission,mother_tongue,maths,english,...,last child,middle child,authoritarian,authoritative,permissive,private,public,service,both,single
0,a,10.0,f,ethiopia,yes,5.0,paid,eth-lang,76.38,94.86,...,0,1,0,1,0,1,0,0,1,0
1,b,12.0,m,ethiopia,no,5.0,scholarship,eth-lang,74.92,74.12,...,0,0,0,1,0,0,1,0,1,0
2,a,10.0,m,ethiopia,yes,5.0,paid,eth-lang,84.33,80.12,...,1,0,0,1,0,1,0,0,1,0
3,c,11.0,m,ethiopia,no,5.0,paid,eth-lang,90.63,82.67,...,1,0,0,1,0,1,0,0,1,0
4,b,11.0,f,foriegn_soil,no,5.0,paid,for-lang,96.00,96.31,...,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,b,17.0,f,ethiopia,no,12.0,paid,eth-lang,74.35,92.25,...,0,1,0,1,0,0,0,1,1,0
283,b,18.0,f,ethiopia,no,12.0,paid,eth-lang,66.71,79.77,...,0,1,0,1,0,1,0,0,1,0
284,a,17.0,f,ethiopia,yes,12.0,paid,eth-lang,64.07,91.40,...,0,1,0,1,0,0,0,1,1,0
285,a,18.0,f,ethiopia,yes,12.0,paid,eth-lang,65.91,77.82,...,1,0,0,1,0,1,0,0,1,0


In [25]:
def conx(x):
    """Convert a conduct grade into an ordinal score.

    'a' maps to 3, 'b' maps to 2, and every other value maps to 1.
    """
    return 3 if x == 'a' else 2 if x == 'b' else 1

In [26]:
merged4.conduct.value_counts()

a    221
b     50
c     15
Name: conduct, dtype: int64

In [27]:
# Replace the conduct letters with their ordinal scores via conx.
# NOTE(review): not idempotent — conx maps anything other than 'a'/'b' to 1, so running
# this cell a second time would turn every (already numeric) value into 1.
merged4.conduct = merged4.conduct.apply(conx)
merged4.conduct.value_counts()

3    221
2     50
1     15
Name: conduct, dtype: int64

In [28]:
# Integer-encode the remaining categorical columns. The shared LabelEncoder is
# re-fitted for each column in turn, so afterwards it only holds the classes of
# the last column processed.
for col in ['pob', 'gender', 'tutor/makeup', 'admission', 'mother_tongue']:
    merged4[col] = label_encoder.fit_transform(merged4[col])

In [29]:
merged4

Unnamed: 0,conduct,age,gender,pob,tutor/makeup,grade,admission,mother_tongue,maths,english,...,last child,middle child,authoritarian,authoritative,permissive,private,public,service,both,single
0,3,10.0,0,0,1,5.0,0,0,76.38,94.86,...,0,1,0,1,0,1,0,0,1,0
1,2,12.0,1,0,0,5.0,1,0,74.92,74.12,...,0,0,0,1,0,0,1,0,1,0
2,3,10.0,1,0,1,5.0,0,0,84.33,80.12,...,1,0,0,1,0,1,0,0,1,0
3,1,11.0,1,0,0,5.0,0,0,90.63,82.67,...,1,0,0,1,0,1,0,0,1,0
4,2,11.0,0,1,0,5.0,0,1,96.00,96.31,...,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,2,17.0,0,0,0,12.0,0,0,74.35,92.25,...,0,1,0,1,0,0,0,1,1,0
283,2,18.0,0,0,0,12.0,0,0,66.71,79.77,...,0,1,0,1,0,1,0,0,1,0
284,3,17.0,0,0,1,12.0,0,0,64.07,91.40,...,0,1,0,1,0,0,0,1,1,0
285,3,18.0,0,0,1,12.0,0,0,65.91,77.82,...,1,0,0,1,0,1,0,0,1,0


In [30]:
len(merged4.columns)

22

In [31]:
merged4.dtypes

conduct            int64
age              float64
gender             int64
pob                int64
tutor/makeup       int64
grade            float64
admission          int64
mother_tongue      int64
maths            float64
english          float64
gpa              float64
first child        uint8
last child         uint8
middle child       uint8
authoritarian      uint8
authoritative      uint8
permissive         uint8
private            uint8
public             uint8
service            uint8
both               uint8
single             uint8
dtype: object

In [32]:
# Standardise every column of merged4 with StandardScaler.
# NOTE(review): this includes the target column `gpa`, and the model further down is
# trained on the unscaled `merged4`, so `scaled_data` does not feed into the fitted model.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(merged4)
scaled_data

array([[ 0.50535836, -2.07974967, -0.95888638, ..., -0.50981555,
         0.51525404, -0.42640143],
       [-1.30129777, -1.16641623,  1.04287643, ..., -0.50981555,
         0.51525404, -0.42640143],
       [ 0.50535836, -2.07974967,  1.04287643, ..., -0.50981555,
         0.51525404, -0.42640143],
       ...,
       [ 0.50535836,  1.11691739, -0.95888638, ...,  1.96149372,
         0.51525404, -0.42640143],
       [ 0.50535836,  1.57358411, -0.95888638, ..., -0.50981555,
         0.51525404, -0.42640143],
       [ 0.50535836,  1.57358411,  1.04287643, ..., -0.50981555,
         0.51525404, -0.42640143]])

In [33]:

# Rescale the standardised matrix with sklearn's `normalize`, using its default arguments
# (presumably a per-row L2 norm — confirm against the sklearn documentation).
normalized_data = normalize(scaled_data)
# Check what type the call returned.
type(normalized_data)

numpy.ndarray

In [34]:
# Wrap the normalised array in a DataFrame that reuses merged4's column names; the row
# index is a fresh 0..n-1 range, not merged4's original labels.
df_new = pd.DataFrame(normalized_data, columns = merged4.columns)

In [35]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   conduct        286 non-null    float64
 1   age            286 non-null    float64
 2   gender         286 non-null    float64
 3   pob            286 non-null    float64
 4   tutor/makeup   286 non-null    float64
 5   grade          286 non-null    float64
 6   admission      286 non-null    float64
 7   mother_tongue  286 non-null    float64
 8   maths          286 non-null    float64
 9   english        286 non-null    float64
 10  gpa            286 non-null    float64
 11  first child    286 non-null    float64
 12  last child     286 non-null    float64
 13  middle child   286 non-null    float64
 14  authoritarian  286 non-null    float64
 15  authoritative  286 non-null    float64
 16  permissive     286 non-null    float64
 17  private        286 non-null    float64
 18  public    

In [36]:
# Feature matrix: the 21 predictor columns of merged4 (everything except `gpa`).
# The column order matters: a later cell passes a hand-written row to clf.predict
# and relies on this exact positional order.
X = merged4[['conduct', 'age', 'gender', 'pob', 'tutor/makeup', 'grade', 'admission','mother_tongue', 'maths', 'english', 'first child', 'last child','middle child', 'authoritarian', 'authoritative', 'permissive','private', 'public', 'service', 'both', 'single']]
# Target column.
Y = merged4['gpa']

In [37]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split — and
# therefore the reported score and the pickled model — reproducible across runs
# (same seed as the ShuffleSplit objects used for cross-validation in this notebook).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# LinearRegression is already imported in the notebook's import cell.
clf = LinearRegression()

In [38]:
clf.fit(x_train,y_train)

LinearRegression()

In [39]:
clf.predict(x_test)

array([80.66067481, 97.52285605, 84.43778038, 90.95519247, 94.21071343,
       94.24453872, 84.84279191, 95.95038184, 97.09837366, 82.48720142,
       82.41443443, 94.4939157 , 82.00484024, 80.34789443, 88.70613483,
       85.73622907, 88.25241441, 79.61656389, 91.78070007, 89.15685534,
       85.53698366, 95.72590091, 94.52213546, 94.02523582, 78.22265651,
       90.58376858, 87.82484713, 79.20929791, 93.11473834, 78.27228746,
       90.16249405, 87.63960251, 88.00594926, 93.76019388, 91.94145813,
       85.09487925, 92.38236907, 87.75680845, 92.63726223, 88.22015056,
       75.74385825, 87.41329632, 90.75103188, 86.71322506, 85.66306072,
       88.73670985, 90.81241421, 77.45370116, 90.15868164, 78.41988168,
       83.75123063, 91.84729099, 88.11728596, 96.46381789, 90.24311493,
       88.74130627, 81.53958632, 82.04688637, 86.45226528, 97.44870624,
       82.30074429, 78.35664228, 97.58292854, 87.64636249, 90.45514137,
       87.07275375, 80.53652723, 96.48641607, 82.72308858, 84.87

In [40]:
clf.score(x_test,y_test)

0.742712864754042

In [41]:
# Predict gpa for one hand-written record. The values follow the column order of X;
# wrapping them in a DataFrame that reuses X.columns hands the model the same feature
# names it was fitted with, instead of an anonymous positional list.
clf.predict(
    pd.DataFrame(
        [[1,14,1,1,1,9,1,0,85,93,0,0,1,1,0,0,1,0,0,0,1]],
        columns=X.columns,
    )
)

array([88.53410604])

In [42]:
# Persist the fitted model with pickle. open() does not create directories, so
# ../model must already exist relative to the working directory.
with open("../model/new.pkl", "wb") as f:
    pickle.dump(clf, f)

In [46]:
# Cross-validate on the full data set: 6 random splits, each holding out 20% of the
# rows, with a fixed seed so the splits are repeatable.
cv = ShuffleSplit(n_splits=6,test_size=0.2,random_state=1)

# Fits a fresh LinearRegression per split and returns one score per split.
cross_val_score(LinearRegression(),X,Y,cv=cv)

array([0.65560558, 0.79172255, 0.73042773, 0.74329705, 0.70476885,
       0.66246136])

In [43]:
merged4['gender'].value_counts()

0    149
1    137
Name: gender, dtype: int64

In [44]:
def find_best_model_using_gridsearch(X,y):
    """Grid-search several regressors and report the best setting found for each.

    Every candidate model is tuned with GridSearchCV over a small parameter grid,
    validated on 5 random splits that each hold out 20% of the rows (fixed seed).

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target values aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``best_score`` and
        ``best_params``, taken from the fitted GridSearchCV object.
    """
    import sklearn  # local import: only needed for the version check below

    # LinearRegression's `normalize` option was deprecated in scikit-learn 1.0 and
    # removed in 1.2. Search over it only where the estimator still exposes it;
    # otherwise fall back to a parameter that every version supports, so the grid
    # search does not fail with an invalid-parameter error.
    if 'normalize' in LinearRegression().get_params():
        linear_params = {'normalize': [True, False]}
    else:
        linear_params = {'fit_intercept': [True, False]}

    # The tree criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and
    # the old spelling was removed in 1.2; pick whichever this version understands.
    sklearn_major = int(sklearn.__version__.split('.')[0])
    squared_error = 'squared_error' if sklearn_major >= 1 else 'mse'

    # Candidate models together with the parameter grid searched for each of them.
    algos = {

        'linear regression': {
            'model': LinearRegression(),
            'params': linear_params
        },

        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection':['random','cyclic']
            }
        },

        'decision tree':  {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': [squared_error,'friedman_mse'],
                'splitter': ['best','random']
            }
        }

    }

    scores = []

    # The same shuffle-split scheme is shared by all models so their scores are comparable.
    cv = ShuffleSplit(n_splits = 5,test_size = 0.2,random_state = 1)

    for algo_name, config in algos.items():

        gs = GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)

        gs.fit(X,y)

        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [45]:
find_best_model_using_gridsearch(X,Y)

Unnamed: 0,model,best_score,best_params
0,linear regression,0.725164,{'normalize': False}
1,lasso,0.719192,"{'alpha': 1, 'selection': 'random'}"
2,decision tree,0.534966,"{'criterion': 'friedman_mse', 'splitter': 'best'}"
