In [27]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [28]:
# Load the dataset from a CSV file resolved relative to the working directory.
df = pd.read_csv("winequality.csv")

In [29]:
# Peek at the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [30]:
# Swap spaces for underscores in three of the column names; these are the
# spellings the feature list further down refers to.
df = df.rename(
    columns={
        'volatile acidity': 'volatile_acidity',
        'free sulfur dioxide': 'free_sulfur_dioxide',
        'total sulfur dioxide': 'total_sulfur_dioxide',
    }
)

In [31]:
# Re-check the header after the rename.
df.head()

Unnamed: 0,fixed acidity,volatile_acidity,citric acid,residual sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [32]:
# Split the frame into features and target by column label rather than by
# position, so the selection keeps working if the CSV gains, loses or reorders
# columns. 'quality' is the target; every other column is a candidate feature.
X = df.drop(columns=['quality'])
y = df['quality']

In [33]:
# Score each feature against the target with the chi-squared statistic.
# NOTE(review): the following cells only read `scores_`, so k=10 does not
# appear to matter for the ranking -- confirm before relying on the k-best mask.
bestfeatures = SelectKBest(chi2, k=10)
fit = bestfeatures.fit(X=X, y=y)

In [34]:
# Wrap the feature names and their scores in single-column frames so they can
# be placed side by side in the next step.
dfcolumns = pd.DataFrame(data=X.columns)
dfscores = pd.DataFrame(data=fit.scores_)

In [35]:
# Put names and scores side by side, then label the two resulting columns.
featureScores = pd.concat([dfcolumns, dfscores], axis="columns")
featureScores = featureScores.set_axis(['Specs', 'Score'], axis="columns")

In [36]:
# Display the feature/score table.
featureScores

Unnamed: 0,Specs,Score
0,fixed acidity,11.260652
1,volatile_acidity,15.580289
2,citric acid,13.025665
3,residual sugar,4.123295
4,chlorides,0.752426
5,free_sulfur_dioxide,161.936036
6,total_sulfur_dioxide,2755.557984
7,density,0.00023
8,pH,0.154655
9,sulphates,4.558488


In [37]:
# Show the four features with the highest scores.
print(featureScores.nlargest(n=4, columns='Score'))

                   Specs        Score
6   total_sulfur_dioxide  2755.557984
5    free_sulfur_dioxide   161.936036
10               alcohol    46.429892
1       volatile_acidity    15.580289


In [38]:
# Features fed to the model: the four top-ranked columns printed above.
numerical = [
    'total_sulfur_dioxide',
    'free_sulfur_dioxide',
    'alcohol',
    'volatile_acidity',
]

In [39]:
# Baseline: fit a linear regression on every row and score it on those same
# rows, so the value shown below is a training-set RMSE rather than an
# estimate of out-of-sample error.
train_dicts = df[numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'quality'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE as the square root of the MSE. The `squared=False` keyword was
# deprecated in scikit-learn 1.4 and is scheduled for removal, so it is avoided
# here; this form gives the same number on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

0.663539653608919

In [40]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Read the CSV at ``filename`` with pandas' default settings.

    Args:
        filename: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed contents as a DataFrame.
    """
    return pd.read_csv(filename)

In [41]:
# Quick look at the first target values.
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Carve a validation set (20%) out of the remaining training rows.
# NOTE: X_tr / y_tr are overwritten here, so running this cell again without
# re-running the previous one splits the already-reduced training set once more.
X_tr, X_va, y_tr, y_va = train_test_split(
    X_tr,
    y_tr,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Row counts of the training and validation splits.
X_tr.shape[0], X_va.shape[0]

(1023, 256)

In [46]:
# Fresh vectorizer for the train/validation workflow; it replaces the instance
# that was fitted on the full frame earlier.
dv = DictVectorizer()

In [47]:
# One dict per row for each split, restricted to the selected features.
train_dicts = X_tr[numerical].to_dict(orient="records")
val_dicts = X_va[numerical].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it unchanged for the
# validation rows.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [48]:
# Preview the validation targets.
y_va.head()

975     5
455     8
1593    6
36      6
713     5
Name: quality, dtype: int64

In [49]:
# Name of the target column, plus plain arrays of the training and validation
# targets for the estimator.
target = 'quality'
y_train, y_val = y_tr.to_numpy(), y_va.to_numpy()

In [50]:
# Fit on the training split and score on the validation split.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE as the square root of the MSE. The `squared=False` keyword
# was deprecated in scikit-learn 1.4 and is scheduled for removal, so it is
# avoided here; this form gives the same number on every scikit-learn version.
mean_squared_error(y_val, y_pred) ** 0.5

0.6616905655218737

In [51]:
def dump_pickle(obj, filename):
    """Serialize ``obj`` with pickle into the file at ``filename``.

    The file is opened in binary write mode, so existing content is replaced.

    Args:
        obj: Any picklable Python object.
        filename: Destination path.

    Returns:
        None.
    """
    with open(filename, "wb") as sink:
        pickle.dump(obj, sink)

In [52]:
# Persist the fitted vectorizer together with the model as a single tuple, so
# both can be restored in one load. Goes through the dump_pickle helper defined
# in the previous cell instead of repeating its open/dump logic inline.
dump_pickle((dv, lr), 'lin_reg.bin')