In [3]:
import seaborn as sns
import pandas as pd
import numpy as np
import csv

# Reformat the raw data: the source file is semicolon-delimited, so rewrite it
# as a standard comma-delimited CSV.
# Both files are opened in a `with` block so the output is flushed and closed
# before pandas reads it back (an unclosed writer can leave the tail of the
# file sitting in the buffer). newline='' lets the csv module manage line
# endings itself, as the csv documentation requires.
with open('./winequality-red.csv', 'r', newline='') as src_file, \
        open('./winequality-red-reformatted.csv', 'w', newline='') as dst_file:
    reader = csv.reader(src_file, delimiter=';')
    writer = csv.writer(dst_file, delimiter=',')
    writer.writerows(reader)

# read reformatted data using Pandas
Data = pd.read_csv('./winequality-red-reformatted.csv')

### Data Preprocessing

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# Build the binary label list 'target': 0 if the element in the quality column
# is less than or equal to quality's median value, else 1.
# The median is computed once up front instead of once per row.
quality_median = np.median(Data.quality)
target = [0 if q <= quality_median else 1 for q in Data.quality]

# create a Pandas series from 'target' (aligned to Data's own index) and attach it
Data["target"] = pd.Series(target, index=Data.index)
print(Data["target"])

# print out the frequency of each class in 'target' column
print("\nFrequency of each class in 'target' column:")
print(Data["target"].value_counts(), "\n")

# The frequency of ones in the target column is much lower than the frequency of
# zeros, so it might be a good idea to oversample the training data

# create instances of minmaxscaler & logistic regression
scaler = MinMaxScaler()
reg = LogisticRegression()

# drop the label columns; keep the raw (unscaled) feature matrix as a NumPy array
# so the scaler is fitted without column names, as it was before
raw_features = Data.drop(columns = ['target', 'quality']).to_numpy()
y = Data.target

# split the data into training and testing set with the ratio of 80:20
# (splitting BEFORE scaling so no test-set statistics leak into the scaler)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    raw_features, y, test_size = 0.2, random_state = 20
)

# fit the scaler on the training rows only, then apply that same mapping to
# every partition; test values may therefore fall slightly outside [0, 1]
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X = scaler.transform(raw_features)

# oversample the training data
sm = SMOTE(random_state = 20)
X_resamp, y_resamp = sm.fit_resample(X_train, y_train)

# print out the frequency of each class in 'target' column after oversample
print("\nFrequency of each class in 'target' column:")
print(y_resamp.value_counts(), "\n")

# NOTE: the scaler is intentionally NOT refitted on X_resamp. The resampled data
# is already scaled, so refitting would transform nothing and would overwrite the
# raw-data -> [0, 1] mapping learned from the training set.

0       0
1       0
2       0
3       0
4       0
       ..
1594    0
1595    0
1596    0
1597    0
1598    0
Name: target, Length: 1599, dtype: int64

Frequency of each class in 'target' column:
0    1382
1     217
Name: target, dtype: int64 


Frequency of each class in 'target' column:
0    1108
1    1108
Name: target, dtype: int64 

