In [None]:
## IMPORTING THE LIBRARIES
import numpy as np  # linear algebra
import pandas as pd  # Handling dataframes 
import seaborn as sns #For plotting
import matplotlib.pyplot as plt
#from datetime import datetime #For fancy timestamps

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it to wherever train.csv lives.
TRAIN_CSV_PATH = 'C:/Users/Varghese/Desktop/Code/Knight ML Assignment/Data/train.csv'
data_wine = pd.read_csv(TRAIN_CSV_PATH)
data_wine.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data_wine.shape

In [None]:
# Build df1 for a purely text-based classifier: an independent copy of data_wine
# that keeps only the review text and the variety name.
TEXT_AND_LABEL_COLUMNS = ['review_description', 'variety']
df1 = data_wine.loc[:, TEXT_AND_LABEL_COLUMNS].copy()
df1

In [None]:
#VIEWING DTYPES, AND NULL-VALUES IN THE DATASET
# DataFrame.info() prints a summary of df1: one line per column with its dtype and
# non-null count, so missing values show up as counts below the total row count.
df1.info()

In [None]:
# Give every variety an integer id (assigned in order of first appearance) and
# build lookup tables in both directions: variety -> id and id -> variety.
df1['category_id'] = pd.factorize(df1['variety'])[0]

# One row per distinct (variety, category_id) pair, ordered by id.
category_id_df = (
    df1[['variety', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

category_to_id = dict(category_id_df.to_numpy())
id_to_category = dict(category_id_df[['category_id', 'variety']].to_numpy())
df1

In [None]:
# Bar chart of the number of review descriptions available for each variety.
fig = plt.figure(figsize=(8, 6))
descriptions_per_variety = df1.groupby('variety')['review_description'].count()
descriptions_per_variety.plot.bar(ylim=0)
plt.show()

In [None]:
# Turn the review text into TF-IDF (Term Frequency, Inverse Document Frequency) features.
from sklearn.feature_extraction.text import TfidfVectorizer

# Change min_df value to <10 for best results. WARNING: CONSUMES HIGH MEMORY
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=100,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# fit_transform returns a sparse matrix; it is densified here because the
# following cells work on a plain array.
tfidf_matrix = tfidf.fit_transform(df1['review_description'])
features = tfidf_matrix.toarray()
labels = df1['category_id']
features.shape

In [None]:
# For every variety, print the N unigrams and N bigrams whose TF-IDF feature has the
# highest chi-squared statistic against a one-vs-rest target (labels == category_id).
from sklearn.feature_selection import chi2

N = 2

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back to the old name on releases
# that predate it. The vocabulary is the same for every variety, so it is fetched
# once here instead of on each loop iteration.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())

for variety, category_id in sorted(category_to_id.items()):
  # chi2 returns (statistics, p-values); only the statistics are used for ranking.
  features_chi2 = chi2(features, labels == category_id)
  # argsort is ascending, so the strongest terms end up at the tail of the array.
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(variety))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

In [None]:
# Baseline text classifier: word counts -> TF-IDF weights -> Multinomial Naive Bayes.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out part of the data; the target here is the variety name itself.
X_train, X_test, y_train, y_test = train_test_split(
    df1['review_description'],
    df1['variety'],
    random_state=42,
)

# Both transformers are fitted on the training split only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# fit() returns the fitted estimator, which is what clf is bound to.
naive_bayes = MultinomialNB()
clf = naive_bayes.fit(X_train_tfidf, y_train)

In [None]:
#PREDICTION 1
# clf was fitted on TF-IDF-weighted counts, so the query text must go through the same
# two steps (count_vect, then tfidf_transformer) before predicting. Passing raw counts
# straight to clf gives it features on a different scale than the ones it was trained on.
sample_counts_1 = count_vect.transform(["This wine is near equal parts Syrah and Merlot with the balance Cabernet Sauvignon. Aromas of blue fruit, vanilla, cherry and herb lead to full-bodied pit-fruit flavors that bring a sense of deliciousness that is hard to resist."])
print(clf.predict(tfidf_transformer.transform(sample_counts_1)))

In [None]:
#CHECK 1
# Show the rows of df1 whose description is exactly the text used in PREDICTION 1,
# so the recorded variety can be compared with the predicted one.
check_text_1 = "This wine is near equal parts Syrah and Merlot with the balance Cabernet Sauvignon. Aromas of blue fruit, vanilla, cherry and herb lead to full-bodied pit-fruit flavors that bring a sense of deliciousness that is hard to resist."
df1.loc[df1['review_description'] == check_text_1]

In [None]:
#PREDICTION 2
# clf was fitted on TF-IDF-weighted counts, so the query text must go through the same
# two steps (count_vect, then tfidf_transformer) before predicting. Passing raw counts
# straight to clf gives it features on a different scale than the ones it was trained on.
sample_counts_2 = count_vect.transform(["An older vintage, this is a soft and malleable wine dusty in tannin and oak, with light-bodied flavors and aromas of blackberry, cherry and chocolate."])
print(clf.predict(tfidf_transformer.transform(sample_counts_2)))

In [None]:
#CHECK 2
# Show the rows of df1 whose description is exactly the text used in PREDICTION 2,
# so the recorded variety can be compared with the predicted one.
check_text_2 = "An older vintage, this is a soft and malleable wine dusty in tannin and oak, with light-bodied flavors and aromas of blackberry, cherry and chocolate."
df1.loc[df1['review_description'] == check_text_2]

In [None]:
# BENCHMARKING ON MULTIPLE MODELS
# Logistic regression; Random Forest Classifier; Linear SupportVector Classifier; Naive Bayes (Multinomial)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=42),
]
CV = 5  # number of cross-validation folds per model

# Collect one (model_name, fold_idx, accuracy) record per model and fold, then build
# cv_df once from the full list. The original pre-allocated an empty cv_df that was
# overwritten before ever being read; that dead assignment is removed.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  entries.extend(
      (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
  )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot shows the spread per model; the strip plot overlays the individual folds.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Mean of the per-fold accuracies stored in cv_df, one value per model_name.
cv_df.groupby('model_name').accuracy.mean()

In [None]:
#ASSUMING LINEARSVC GAVE THE BEST SCORE IN THE CELL ABOVE
from sklearn.metrics import confusion_matrix

model = LinearSVC() # alternatives: RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42) # MultinomialNB() # LogisticRegression()

# df1.index is split alongside the data so test rows can be traced back to df1 later.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df1.index, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Pin the label order to category_id_df so that row/column i of the matrix is always
# category_id i and lines up with the tick labels, even if some variety is absent
# from this particular test split.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_id_df.category_id.values)

fig, ax = plt.subplots(figsize=(10,10))
# Fixed: the original referenced an undefined name `category_id_df1` for the x tick
# labels, which raised NameError; the lookup table is `category_id_df`.
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=category_id_df.variety.values, yticklabels=category_id_df.variety.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
#The majority of predictions should be on the diagonals

In [None]:
#Checking the discrepancies here, using confusion matrix for false positives.
from IPython.display import display

# For every (actual, predicted) pair of different varieties that was confused at least
# 10 times, print the count and show the affected reviews.
# Fixed: the original iterated over an undefined name `category_id_df1` (NameError);
# the lookup table is `category_id_df`.
for predicted in category_id_df.category_id:
  for actual in category_id_df.category_id:
    if predicted != actual and conf_mat[actual, predicted] >= 10:
      print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))
      # Build a purely positional boolean mask: y_test may carry its own (shuffled)
      # index while y_pred is a plain array, so both are compared as arrays before
      # selecting the matching original row labels from indices_test.
      misclassified = (np.asarray(y_test) == actual) & (np.asarray(y_pred) == predicted)
      display(df1.loc[indices_test[misclassified]][['variety', 'review_description']])
      print('')

In [None]:
#Not an important step. Just a cross-verification.
# Refit the chosen model on all the data and print, for every variety, the N unigrams
# and N bigrams with the largest coefficients in that variety's row of model.coef_.
model.fit(features, labels)
N = 2

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back to the old name on releases
# that predate it. Fetched once, since the vocabulary is the same for every variety.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())

for variety, category_id in sorted(category_to_id.items()):
  # argsort is ascending, so the list is walked in reverse to get the largest first.
  indices = np.argsort(model.coef_[category_id])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
  bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
  # Fixed: the original formatted an undefined name `Product` (NameError); the loop
  # variable holding the class name is `variety`.
  print("# '{}':".format(variety))
  print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
  print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
#SCORE-SHEET FOR EACH VARIETY
from sklearn import metrics

# Fixed: the original used df['Product'].unique(), but neither `df` nor a 'Product'
# column exists in this notebook. The class names live in category_id_df, which is
# sorted by category_id; passing the ids as `labels` keeps each name attached to the
# right class even if a variety is missing from the test split.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=category_id_df['category_id'].values,
    target_names=category_id_df['variety'].values,
))