In [61]:
import numpy as np
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
# Load the training split; the bare `df.shape` expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer="Train.csv")
df.shape

(3132, 9)

In [63]:
# Load the test split and display its (rows, columns) shape.
tst = pd.read_csv(filepath_or_buffer="Test.csv")
tst.shape

(1045, 9)

In [64]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,H,Target
0,F,0.615,0.455,0.135,1.059,0.4735,0.263,0.274,9
1,F,0.515,0.395,0.14,0.686,0.281,0.1255,0.22,12
2,M,0.66,0.53,0.175,1.583,0.7395,0.3505,0.405,10
3,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
4,M,0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35,6


In [65]:
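# Disabled: correlation heatmap of the training columns.
# (np.bool was removed in NumPy 1.24, so the builtin bool is used for the mask dtype.)
# f, ax = plt.subplots(figsize=(10, 8))
# corr = df.corr()
# sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
#             cmap=sns.diverging_palette(220, 10, as_cmap=True),
#             square=True, ax=ax)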

In [66]:
# Count missing entries per column (isna is the canonical name of isnull).
df.isna().sum()

A         0
B         0
C         0
D         0
E         0
F         0
G         0
H         0
Target    0
dtype: int64

In [67]:
# Print each column's dtype, one per line; non-numeric (object) columns
# will need to be encoded before modelling.
for column_dtype in df.dtypes:
    print(column_dtype)

object
float64
float64
float64
float64
float64
float64
float64
int64


In [68]:
# Split the label column off the predictors.
# The guard makes this cell safe to re-execute: once 'Target' has been removed
# from `df`, a second run keeps the existing `y` instead of raising KeyError.
# `df` is rebound to a new frame rather than mutated with inplace=True.
if 'Target' in df.columns:
    y = df['Target']
    df = df.drop(columns='Target')
print(y)

0        9
1       12
2       10
3       15
4        6
        ..
3127     9
3128    12
3129    11
3130     9
3131     6
Name: Target, Length: 3132, dtype: int64


In [69]:
# No missing values so no need of imputer.
# imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
# imputer.fit(df.iloc[:,[1,2,3,4,5,6,7]].values)
# df.iloc[:,[1,2,3,4,5,6,7]] = imputer.transform(df.iloc[:,[1,2,3,4,5,6,7]].values)

In [78]:
# One-hot encode the categorical column A. Dummy encoding is used instead of
# label encoding because no ordering of the categories was given; drop_first
# removes one dummy level so it acts as the baseline.
final_features = pd.get_dummies(data=df, drop_first=True)

In [79]:
# Display the encoded training features (numeric columns followed by the A_* dummies).
final_features

Unnamed: 0,B,C,D,E,F,G,H,A_I,A_M
0,0.615,0.455,0.135,1.0590,0.4735,0.2630,0.274,0,0
1,0.515,0.395,0.140,0.6860,0.2810,0.1255,0.220,0,0
2,0.660,0.530,0.175,1.5830,0.7395,0.3505,0.405,0,1
3,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.150,0,1
4,0.495,0.400,0.155,0.8085,0.2345,0.1155,0.350,0,1
...,...,...,...,...,...,...,...,...,...
3127,0.490,0.400,0.115,0.5690,0.2560,0.1325,0.145,0,0
3128,0.670,0.550,0.190,1.3905,0.5425,0.3035,0.400,0,0
3129,0.510,0.395,0.125,0.5805,0.2440,0.1335,0.188,0,1
3130,0.575,0.465,0.120,1.0535,0.5160,0.2185,0.235,0,1


In [72]:
# Print the dtype of every encoded feature column to confirm they are all numeric.
for column_dtype in final_features.dtypes:
    print(column_dtype)

float64
float64
float64
float64
float64
float64
float64
uint8
uint8


In [88]:
# Fit a multiple linear regression on the encoded training features.
# The trailing fit(...) expression also displays the fitted estimator.
mlr = LinearRegression()
mlr.fit(
    X=final_features[["B", 'C', 'D', 'E', "F", "G", "H", 'A_I', 'A_M']],
    y=y,
)

LinearRegression()

In [89]:
# R^2 of the fitted model on the same rows it was trained on
# (a training score, not a held-out estimate).
mlr.score(
    X=final_features[["B", 'C', 'D', 'E', 'F', "G", "H", 'A_I', 'A_M']],
    y=y,
)

0.5344827192989167

In [90]:
# Encode the test set WITHOUT drop_first, then align it to the training layout.
# Encoding each split independently with drop_first=True drops whichever
# category sorts first *in that split*, so a test file that lacks a category
# would yield dummy columns that are missing or mean something different from
# the ones the model was trained on.
# The reindex keeps "Index" at position 0, followed by exactly the training
# feature columns in training order; dummies absent from the test data are
# filled with 0, and columns the training frame does not have are dropped.
final_features1 = pd.get_dummies(tst).reindex(
    columns=["Index"] + list(final_features.columns), fill_value=0
)

In [91]:
# Display the encoded test features ("Index" identifier first, then the predictors).
final_features1

Unnamed: 0,Index,B,C,D,E,F,G,H,A_I,A_M
0,866,0.605,0.455,0.160,1.1035,0.4210,0.3015,0.325,0,1
1,1483,0.590,0.440,0.150,0.8725,0.3870,0.2150,0.245,0,1
2,599,0.560,0.445,0.195,0.9810,0.3050,0.2245,0.335,0,0
3,1702,0.635,0.490,0.170,1.2615,0.5385,0.2665,0.380,0,0
4,670,0.475,0.385,0.145,0.6175,0.2350,0.1080,0.215,0,1
...,...,...,...,...,...,...,...,...,...,...
1040,532,0.470,0.370,0.120,0.4705,0.1845,0.1055,0.155,1,0
1041,3417,0.580,0.460,0.150,1.0165,0.4910,0.2210,0.265,0,1
1042,1505,0.635,0.510,0.185,1.3080,0.5440,0.3180,0.377,0,1
1043,2245,0.460,0.375,0.140,0.5105,0.1920,0.1045,0.205,0,1


In [93]:
# Predict the target for every test row.
# Predictors are selected by name instead of by position, so the model always
# receives the same columns, in the same order, that it was fitted on. A missing
# column now fails loudly with KeyError instead of silently shifting features.
ans = mlr.predict(final_features1[["B", "C", "D", "E", "F", "G", "H", "A_I", "A_M"]])

In [95]:
# Assemble the submission table: each test row's identifier paired with its prediction.
ans_p = pd.DataFrame(data={"Index": tst["Index"], "Target": ans})

In [96]:
# Display the submission table (Index and predicted Target).
ans_p

Unnamed: 0,Index,Target
0,866,11.798566
1,1483,10.253717
2,599,14.083293
3,1702,12.032196
4,670,11.209152
...,...,...
1040,532,9.138743
1041,3417,9.748612
1042,1505,12.283992
1043,2245,10.921454


In [100]:
# Wrap ans_p in a new DataFrame object. index=None is the constructor default,
# so the existing row index is carried over unchanged.
df_final = pd.DataFrame(data=ans_p)

In [101]:
# Save the submission as a CSV file.
# index=False keeps pandas' automatic row index out of the file, so it contains
# only the "Index" and "Target" columns; without it an extra unnamed leading
# column is written.
df_final.to_csv("Ans.csv", index=False)