In [None]:
from warnings import filterwarnings

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

from sklearn import model_selection
from sklearn import tree, linear_model, ensemble, neighbors, svm
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor

# Notebook-wide plotting and warning configuration.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Seaborn context preset followed by the matplotlib style sheet; both adjust
# matplotlib's rcParams, so their relative order is kept as-is.
sns.set_context("notebook")
plt.style.use("fivethirtyeight")
# NOTE(review): with no category or module argument this silences every warning,
# including deprecation warnings from pandas/seaborn/sklearn -- consider
# narrowing the filter.
filterwarnings("ignore")



We can check basic information about the data, such as its features and NA entries, using pandas library functions.

In [None]:
# Load the training set (path is relative to the notebook's working directory).
df = pd.read_csv("training.csv")
# A notebook cell only echoes its final expression, so intermediate inspections
# are printed explicitly instead of being silently discarded.
print(len(df['DNA']))

# Strip leading/trailing whitespace from every string (object-dtype) column.
df_obj = df.select_dtypes(['object'])
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# Column dtypes and non-null counts for the whole frame.
df.info()
# Length of one DNA entry (the row labelled 3) after stripping.
print(len(df['DNA'][3]))

# Expand each DNA string into one column per character and summarise the result.
df['DNA'].apply(lambda x: pd.Series(list(x))).info()

# Final expression of the cell: rich preview of the first rows.
df.head()


After that, we check whether there are any undefined entries. If there are, we cannot continue the analysis until we replace them with some meaningful values (such as the mean or median of that column).

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

Since there are no null values, we can safely proceed. As the next step, we check the ratios of the classes in the data.

In [None]:
# Tabulate and plot how many samples carry each raw quality score.
print(df['quality'].value_counts())
# Pass the column through the `x` keyword: in recent seaborn releases the only
# positional parameter of countplot is `data`, so a positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=df['quality'])
plt.xlabel("Quality")
plt.ylabel("Count")
plt.title("Class Counts")
# Final expression of the cell: the distinct quality values present.
set(df['quality'])

We see that the dataset contains mostly average-quality wines, and very few excellent or poor ones. We group them into three categories: poor if quality is less than 5, excellent if quality is greater than 6, and average otherwise.

In [None]:
# Boolean masks over the raw quality scores: below 5 is "poor", above 6 is
# "excellent", and every remaining row is "average".
quality_raw = df['quality']
poor = quality_raw.lt(5)
excellent = quality_raw.gt(6)
average = ~poor & ~excellent
# Keep an untouched snapshot of the frame before the scores are relabelled.
df_orig = df.copy()

In [None]:
# Start from the untouched snapshot so re-running this cell gives the same result.
df = df_orig.copy()
# Collapse the raw scores into three ordinal classes using the boolean masks:
# 0 = poor, 1 = average, 2 = excellent.
df.loc[poor, 'quality'] = 0
df.loc[average, 'quality'] = 1
df.loc[excellent, 'quality'] = 2
# Pass the column through the `x` keyword: in recent seaborn releases the only
# positional parameter of countplot is `data`, so a positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=df['quality'])
plt.xlabel("Quality")
plt.ylabel("Count")
plt.title("Class Counts")
# Printed explicitly: a bare expression in the middle of a cell is not echoed.
print(set(df['quality']))
print(df['quality'].value_counts())

In [None]:
# Preview the first five rows of the current frame.
df.head(n=5)


In [None]:
# Separate the predictors from the target column, then hold out the rows not
# used for training (train_size=0.8). The fixed seed makes the split repeatable.
features = df.drop(columns=["quality"])
target = df["quality"]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features, target, train_size=0.8, random_state=42)

In [None]:
# Fit a decision tree on all training data.
# The class is referenced directly rather than as `tree.DecisionTreeRegressor`
# because this cell rebinds the name `tree` to the fitted estimator; going
# through the module attribute would raise AttributeError the second time the
# cell is run. The name `tree` is kept because the following cell uses it.
# random_state fixes the estimator's internal randomness so repeated runs
# produce the same tree.
tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

In [None]:
# Mean absolute error of the fitted decision tree, first on the training split
# and then on the held-out split.
tree_train_mae = metrics.mean_absolute_error(y_train, tree.predict(X_train))
print("sklearn decision tree, training error: %f" % tree_train_mae)
tree_test_mae = metrics.mean_absolute_error(y_test, tree.predict(X_test))
print("sklearn decision tree, testing error: %f" % tree_test_mae)


In [None]:
# Ordinary least-squares baseline fitted on the full training split
# (fit returns the estimator itself, so the call can be chained).
lr = linear_model.LinearRegression().fit(X_train, y_train)

# Mean absolute error of the linear model on the training split, then on the
# held-out split.
lr_train_mae = metrics.mean_absolute_error(y_train, lr.predict(X_train))
print("sklearn linear regression, training error: %f" % lr_train_mae)
lr_test_mae = metrics.mean_absolute_error(y_test, lr.predict(X_test))
print("sklearn linear regression, testing error: %f" % lr_test_mae)

In [None]:
# Fit a k-nearest-neighbours classifier (library default settings) on the
# training split.
knn = neighbors.KNeighborsClassifier()
knn.fit(X_train, y_train)

# Compute training error and testing error for the KNN classifier.
# The labels previously said "linear regression" (copy-paste from the earlier
# cell); they now name the model that is actually being evaluated.
print("sklearn KNN classifier, training error: %f" %
      metrics.mean_absolute_error(y_train, knn.predict(X_train)))
print("sklearn KNN classifier, testing error: %f" %
      metrics.mean_absolute_error(y_test, knn.predict(X_test)))

In [None]:
# Fit a support vector classifier with a degree-5 polynomial kernel on the
# training split.
# NOTE(review): probability=True is not used by anything in this cell --
# confirm a later cell needs probability estimates before keeping it.
svc = svm.SVC(C=0.01, gamma=0.1, kernel="poly", degree=5, coef0=10, probability=True)
svc.fit(X_train, y_train)

# Compute training error and testing error for the Support Vector Machine
# classifier. The labels previously said "linear regression" (copy-paste from
# an earlier cell); they now name the model that is actually being evaluated.
print("sklearn SVM classifier, training error: %f" %
      metrics.mean_absolute_error(y_train, svc.predict(X_train)))
print("sklearn SVM classifier, testing error: %f" %
      metrics.mean_absolute_error(y_test, svc.predict(X_test)))