In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

pd.options.mode.chained_assignment = None

data = pd.read_csv("billets.csv", sep=";", decimal=',')
data

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,True,171.81,104.86,104.95,4.52,2.89,112.83
1,True,171.46,103.36,103.66,3.77,2.99,113.09
2,True,172.69,104.48,103.5,4.4,2.94,113.16
3,True,171.36,103.91,103.94,3.62,3.01,113.51
4,True,171.73,104.28,103.46,4.04,3.48,112.54
...,...,...,...,...,...,...,...
1495,False,171.75,104.38,104.17,4.42,3.09,111.28
1496,False,172.19,104.63,104.44,5.27,3.37,110.97
1497,False,171.8,104.01,104.12,5.51,3.36,111.95
1498,False,172.06,104.28,104.06,5.17,3.46,112.25


In [2]:
# replace is_genuine with boolean values

data["is_genuine"] = data["is_genuine"].replace(True, 1)
data["is_genuine"] = data["is_genuine"].replace(False, 0)

data.loc[data["is_genuine"] == 1]

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,1,171.81,104.86,104.95,4.52,2.89,112.83
1,1,171.46,103.36,103.66,3.77,2.99,113.09
2,1,172.69,104.48,103.5,4.4,2.94,113.16
3,1,171.36,103.91,103.94,3.62,3.01,113.51
4,1,171.73,104.28,103.46,4.04,3.48,112.54
...,...,...,...,...,...,...,...
995,1,171.66,103.92,103.47,4.26,2.83,113.2
996,1,172.16,103.72,103.61,4.3,2.72,113.51
997,1,171.78,103.38,104.22,4.23,3.07,113.77
998,1,171.44,103.96,103.92,3.68,2.89,113.21


In [3]:
# force value to be numeric

for col in data.columns[1:7]:
    data[col] = pd.to_numeric(data[col], errors = "raise")

In [4]:
# NaN count

data.isna().sum()

is_genuine       0
diagonal         0
height_left      0
height_right     0
margin_low      37
margin_up        0
length           0
dtype: int64

In [5]:
# linReg method to replace NaN

data_nan = data[data["margin_low"].isnull()]

print("number of NaN : ", len(data_nan))

number of NaN :  37


In [6]:
# isolate NaN rows

test_data_nan = data_nan.copy(deep = True)
test_data_nan

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
72,1,171.94,103.89,103.45,,3.25,112.79
99,1,171.93,104.07,104.18,,3.14,113.08
151,1,172.07,103.8,104.38,,3.02,112.93
197,1,171.45,103.66,103.8,,3.62,113.27
241,1,171.83,104.14,104.06,,3.02,112.36
251,1,171.8,103.26,102.82,,2.95,113.22
284,1,171.92,103.83,103.76,,3.23,113.29
334,1,171.85,103.7,103.96,,3.0,113.36
410,1,172.56,103.72,103.51,,3.12,112.95
413,1,172.3,103.66,103.5,,3.16,112.95


In [7]:
# linearRegression from data

data.dropna(inplace=True)

lin_reg = LinearRegression()

x_train = data.drop("margin_low", axis = 1)
y_train = data["margin_low"]

lin_reg.fit(x_train, y_train)

LinearRegression()

In [8]:
# predicted values to input 

x_test = data_nan.drop("margin_low", axis = 1)

y_pred = lin_reg.predict(x_test)
print(len(y_pred))
y_pred

37


array([4.06495361, 4.11199026, 4.13400328, 3.99357074, 4.1403993 ,
       4.09428392, 4.07412432, 4.12538999, 4.0807278 , 4.07363322,
       4.11897255, 4.18037978, 4.13648423, 4.05106842, 4.17837685,
       4.22555104, 4.11586845, 4.10284101, 4.08184346, 4.09276238,
       4.11250192, 4.15717623, 4.16028787, 4.12193808, 4.12353555,
       4.19842271, 4.10962313, 4.09696025, 4.13384101, 5.25968515,
       5.264817  , 5.28251853, 5.30206887, 5.20035843, 5.1754678 ,
       5.17345045, 5.24675055])

In [9]:
# replacing NaN values with predicted values

data_nan["margin_low"] = y_pred
data_nan

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
72,1,171.94,103.89,103.45,4.064954,3.25,112.79
99,1,171.93,104.07,104.18,4.11199,3.14,113.08
151,1,172.07,103.8,104.38,4.134003,3.02,112.93
197,1,171.45,103.66,103.8,3.993571,3.62,113.27
241,1,171.83,104.14,104.06,4.140399,3.02,112.36
251,1,171.8,103.26,102.82,4.094284,2.95,113.22
284,1,171.92,103.83,103.76,4.074124,3.23,113.29
334,1,171.85,103.7,103.96,4.12539,3.0,113.36
410,1,172.56,103.72,103.51,4.080728,3.12,112.95
413,1,172.3,103.66,103.5,4.073633,3.16,112.95


In [10]:
# merging tables

data = pd.concat([data, data_nan])
data

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,1,171.81,104.86,104.95,4.520000,2.89,112.83
1,1,171.46,103.36,103.66,3.770000,2.99,113.09
2,1,172.69,104.48,103.50,4.400000,2.94,113.16
3,1,171.36,103.91,103.94,3.620000,3.01,113.51
4,1,171.73,104.28,103.46,4.040000,3.48,112.54
...,...,...,...,...,...,...,...
1303,0,172.17,104.49,103.76,5.302069,2.93,111.21
1315,0,172.08,104.15,104.17,5.200358,3.40,112.29
1347,0,171.72,104.46,104.12,5.175468,3.61,110.31
1435,0,172.66,104.33,104.41,5.173450,3.56,111.47


In [11]:
# NaN check-up

data.isna().sum()

is_genuine      0
diagonal        0
height_left     0
height_right    0
margin_low      0
margin_up       0
length          0
dtype: int64

In [12]:
# comparing linearRegression with replace_by_mean method

test_data_nan["margin_low"] = data["margin_low"].mean()
test_data_nan

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
72,1,171.94,103.89,103.45,4.482844,3.25,112.79
99,1,171.93,104.07,104.18,4.482844,3.14,113.08
151,1,172.07,103.8,104.38,4.482844,3.02,112.93
197,1,171.45,103.66,103.8,4.482844,3.62,113.27
241,1,171.83,104.14,104.06,4.482844,3.02,112.36
251,1,171.8,103.26,102.82,4.482844,2.95,113.22
284,1,171.92,103.83,103.76,4.482844,3.23,113.29
334,1,171.85,103.7,103.96,4.482844,3.0,113.36
410,1,172.56,103.72,103.51,4.482844,3.12,112.95
413,1,172.3,103.66,103.5,4.482844,3.16,112.95


In [13]:
import scipy.stats as stats

stats.kstest(data_nan["margin_low"], test_data_nan["margin_low"])

KstestResult(statistic=0.7837837837837838, pvalue=1.726271215776404e-11)

In [14]:
# linearRegression from replace_by_mean values

margin_pred_reg = lin_reg.predict(data.drop("margin_low", axis = 1))
margin_pred_mean = np.ones_like(data["margin_low"]) * data["margin_low"].mean()

In [15]:
from sklearn.metrics import r2_score

# r2_score comparaison of both approches

r2_score(data["margin_low"], margin_pred_reg), r2_score(data["margin_low"], margin_pred_mean)

(0.6218878735010731, 2.220446049250313e-16)

In [16]:
%store data

Stored 'data' (DataFrame)
