In [1]:
from sklearn.preprocessing import Normalizer
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import cross_val_predict
import csv
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
# Import classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Change it with the name of your dataset
filename = 'MHDataset.csv'

# Extract columns names (fieldnames)
with open(filename, 'r') as infile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames

# Remove last column (qualityClass)
fieldnames.pop()

# IMPORT DATASET
data = pd.read_csv(open(filename))

# My approach
print('My approach - First Dataset')

# Feature list
features_cols = ['characterCount', 'wordCount', 'syllableCount', 'sentenceCount', 'sectionCount', 'subsectionCount', 'paragraphCount', 'meanSectionSize', 'meanParagraphSize', 'largestSectionSize', 'shortestSectionSize', 'largestShortestSectionRatio', 'sectionSizeStandardDeviation', 'meanOfSubsectionsPerSection', 'abstractSize', 'abstractSizeArtcileLengthRatio', 'citationCount', 'citationCountPerTextLength', 'citationCountPerSection', 'externalLinksCount', 'externalLinksPerTextLength', 'externalLinksPerSection', 'imageCount', 'imagePerTextLength', 'imagePerSection', 'meanSentenceSize', 'largestSentenceSize', 'shortestSentenceSize', 'largeSentenceRate', 'shortSentenceRate', 'questionCount', 'questionRatio', 'exclamationCount', 'exclamationRatio', 'toBeVerbCount', 'toBeVerbRatio', 'toBeVerbPerSentence', 'toBeVerbRate', 'modalAuxiliaryVerbCount', 'modalAuxiliaryVerbsRatio', 'modalAuxiliaryVerbsPerSentence', 'modalAuxiliaryVerbsRate', 'passiveVoiceCount', 'passiveVoiceRatio', 'passiveVoicePerSentence', 'passiveVoiceRate', 'numberOfSentencesThatStartWithACoordinatingConjunction', 'numberOfSentencesThatStartWithADeterminer', 'numberOfSentencesThatStartWithASubordinatingPrepositionOrConjunction', 'numberOfSentencesThatStartWithAnAdjective', 'numberOfSentencesThatStartWithANoun', 'numberOfSentencesThatStartWithAPronoun', 'numberOfSentencesThatStartWithAnAdverb', 'numberOfSentencesThatStartWithAnArticle', 'numberOfSentencesThatStartWithACoordinatingConjunctionRatio', 'numberOfSentencesThatStartWithADeterminerRatio', 'numberOfSentencesThatStartWithASubordinatingPrepositionOrConjunctionRatio', 'numberOfSentencesThatStartWithAnAdjectiveRatio', 'numberOfSentencesThatStartWithANounRatio', 'numberOfSentencesThatStartWithAPronounRatio', 'numberOfSentencesThatStartWithAnAdverbRatio', 'numberOfSentencesThatStartWithAnArticleRatio', 'automatedReadabilityIndex', 'colemanLiauIndex', 'fleshReadingEase', 'fleschKincaidGradeLevel', 'gunningFogIndex', 'lasbarhetsIndex', 'smogGrading', 'daleChallReadabilityFormula', 'differentWordCount', 'differentWordsPerSentence', 'differentWordsRate', 'nounCount', 'nounsPerSentence', 'nounsRate', 'differentNounCount', 'differentNounsPerSentence', 'differentNounsRate', 'differentNounsDifferentWordsRatio', 'verbCount', 'verbsPerSentence', 'verbsRate', 'differentVerbCount', 'differentVerbsPerSentence', 'differentVerbsRate', 'differentVerbsDifferentWordsRatio', 'pronounCount', 'pronounsPerSentence', 'pronounsRate', 'differentPronounCount', 'differentPronounsPerSentence', 'differentPronounsRate', 'differentPronounsDifferentWordsRatio', 'adjectiveCount', 'adjectivesPerSentence', 'adjectivesRate', 'differentAdjectiveCount', 'differentAdjectivesPerSentence', 'differentAdjectivesRate', 'differentAdjectivesDifferentWordsRatio', 'adverbCount', 'adverbsPerSentence', 'adverbsRate', 'differentAdverbCount', 'differentAdverbsPerSentence', 'differentAdverbsRate', 'differentAdverbsDifferentWordsRatio', 'coordinatingConjunctionCount', 'coordinatingConjunctionsPerSentence', 'coordinatingConjunctionsRate', 'differentCoordinatingConjunctionCount', 'differentCoordinatingConjunctionsPerSentence', 'differentCoordinatingConjunctionsRate', 'differentCoordinatingConjunctionsDifferentWordsRatio', 'subordinatingPrepositionAndConjunctionCount', 'subordinatingPrepositionsAndConjunctionsPerSentence', 'subordinatingPrepositionsAndConjunctionsRate', 'differentSubordinatingPrepositionAndConjunctionCount', 'differentSubordinatingPrepositionsAndConjunctionsPerSentence', 'differentSubordinatingPrepositionsAndConjunctionsRate', 'differentSubordinatingPrepositionsAndConjunctionsDifferentWordsRatio', 'syllablesPerWord', 'charactersPerWord', 'NNP,NNP,NNP', 'VBD,DT,JJ', 'IN,DT,NNP', 'NNP,IN,DT', 'DT,NNP,NNP', 'JJ,NN,IN', 'NN,IN,DT', 'IN,DT,NN', 'NN,IN,NNP', 'IN,NNP,NNP', 'NNP,VBD,DT', 'VBD,DT,NN', 'DT,NN,IN', 'VBD,VBN,IN', 'NNP,NNP,VBD', 'IN,NN,IN', 'NNP,NNP,IN', 'NNP,IN,NNP', 'VBD,IN,DT', 'IN,DT,JJ', 'JJ,NNS,IN', 'DT,JJ,NN', 'IN,DT,NNS', 'IN,CD,NNP', 'VBN,IN,DT', 'DT,NN,NN', 'IN,PRP$,NN', 'NNP,VBD,VBN', 'NNP,CC,NNP', 'NNS,IN,DT', 'NN,IN,NN', 'DT,NN,VBD', 'NN,VBD,VBN', 'TO,VB,DT', 'NNP,POS,NN', 'ter', 'er_', '_wa', 'was', 'as_', 's_a', '_a_', 'an_', 'e_a', '_an', 'and', 'nd_', '_re', 'ent', '_of', 'of_', 'f_t', '_th', 'the', 'he_', 'on_', ',_a', 'at_', 'ed_', '_on', 'n_t', 'or_', 'ing', 'ng_', '_in', 'in_', 'd_t', 'd_a', '_he', '_to', 'ted', 'th_', 'al_', 'es_', 'ate', '_co', 'ion', 'ere', '_fo', 'for', 's,_', 'to_', 'ati', 'st_', 're_', '_be', 'ly_', 'her', '_hi', 'his', 'is_', 'e_t', 'en_', 'e_o', 't_t', 'tio', '_Th', 'age', 'agePerReview', 'reviewPerDay', 'reviewsPerUser', 'reviewsPerUserStdDev', 'discussionCount', 'reviewCount', 'registeredReviewCount', 'anonymouseReviewCount', 'registeredReviewRate', 'anonymouseReviewRate', 'registeredAnonymouseReviewRatio', 'userCount', 'occasionalUserCount', 'occasionalUserRate', 'registeredUserCount', 'anonymouseUserCount', 'registerdAnonymouseUserRatio', 'registeredUserRate', 'anonymouseUserRate', 'revertCount', 'revertReviewRatio', 'diversity', 'modifiedLinesRate', 'mostActiveUsersReviewCount', 'mostActiveUsersReviewRate', 'lastThreeMonthsReviewCount', 'lastThreeMonthsReviewRate']

# Select only the columns corresponding to the features in the list
X = data[features_cols]

X.sample(frac=1)

# Select qualityClass as the response (y)
y = data.qualityClass

# Select only the columns corresponding to the features in the list
X = data[features_cols]

X.sample(frac=1)

# Select qualityClass as the response (y)
y = data.qualityClass

print('\n')
print('Decision Tree')
print('\n')
# 10-fold cross-validation with decision tree PREDICTIONS
clf = DecisionTreeClassifier(random_state=8)
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('KNN')
print('\n')
# 10-fold cross-validation with knn PREDICTIONS
clf = KNeighborsClassifier(n_jobs=7, n_neighbors=49) # NORMAL
# clf = KNeighborsClassifier(n_jobs=7, n_neighbors=25) # PARALLEL
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Logistic Regression')
print('\n')
# 10-fold cross-validation with logistic regression PREDICTIONS
clf = LogisticRegression(n_jobs=7)
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Naive Bayes')
print('\n')
# 10-fold cross-validation with naive bayes PREDICTIONS
clf = GaussianNB()
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Random Forest')
print('\n')
# 10-fold cross-validation with random forst PREDICTIONS
clf = RandomForestClassifier(n_jobs=7, n_estimators=200, random_state=5, class_weight='auto')
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Support Vector Classifier')
print('\n')
# 10-fold cross-validation with support vector classifier PREDICTIONS
clf = LinearSVC(dual=False)
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))

print('\n')
print('XGBoost')
print('\n')
# fit model no training data
model = XGBClassifier(max_depth=3, n_estimators=1000, n_jobs=7)

# 10-fold cross-validation with decision tree PREDICTIONS
y_pred = cross_val_predict(model, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred) ))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))

My approach


Decision Tree


             precision    recall  f1-score   support

          1       0.59      0.62      0.61       400
          2       0.33      0.31      0.32       400
          3       0.25      0.24      0.24       400
          4       0.30      0.31      0.30       400
          5       0.37      0.37      0.37       400
          6       0.35      0.36      0.36       400
          7       0.43      0.41      0.42       400

avg / total       0.37      0.38      0.37      2800

Accuracy: 0.37535714285714283
MSE: 2.0021428571428572


KNN


             precision    recall  f1-score   support

          1       0.66      0.84      0.74       400
          2       0.43      0.35      0.38       400
          3       0.21      0.07      0.11       400
          4       0.33      0.23      0.27       400
          5       0.37      0.39      0.38       400
          6       0.34      0.39      0.36       400
          7       0.40      0.66      0.49       400

av

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


             precision    recall  f1-score   support

          1       0.73      0.78      0.75       400
          2       0.46      0.45      0.46       400
          3       0.41      0.36      0.38       400
          4       0.42      0.41      0.41       400
          5       0.53      0.47      0.50       400
          6       0.49      0.52      0.50       400
          7       0.63      0.72      0.67       400

avg / total       0.52      0.53      0.53      2800

Accuracy: 0.5303571428571429
MSE: 1.0482142857142858


  if diff:


In [3]:
from sklearn.preprocessing import Normalizer
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import cross_val_predict
import csv
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
# Import classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Change it with the name of your dataset
filename = 'MHParallelDataset.csv'

# Extract columns names (fieldnames)
with open(filename, 'r') as infile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames

# Remove last column (qualityClass)
fieldnames.pop()

# IMPORT DATASET
data = pd.read_csv(open(filename))

# My approach
print('My approach - Second Dataset')

# Feature list
features_cols = ['characterCount', 'wordCount', 'syllableCount', 'sentenceCount', 'sectionCount', 'subsectionCount', 'paragraphCount', 'meanSectionSize', 'meanParagraphSize', 'largestSectionSize', 'shortestSectionSize', 'largestShortestSectionRatio', 'sectionSizeStandardDeviation', 'meanOfSubsectionsPerSection', 'abstractSize', 'abstractSizeArtcileLengthRatio', 'citationCount', 'citationCountPerTextLength', 'citationCountPerSection', 'externalLinksCount', 'externalLinksPerTextLength', 'externalLinksPerSection', 'imageCount', 'imagePerTextLength', 'imagePerSection', 'meanSentenceSize', 'largestSentenceSize', 'shortestSentenceSize', 'largeSentenceRate', 'shortSentenceRate', 'questionCount', 'questionRatio', 'exclamationCount', 'exclamationRatio', 'toBeVerbCount', 'toBeVerbRatio', 'toBeVerbPerSentence', 'toBeVerbRate', 'modalAuxiliaryVerbCount', 'modalAuxiliaryVerbsRatio', 'modalAuxiliaryVerbsPerSentence', 'modalAuxiliaryVerbsRate', 'passiveVoiceCount', 'passiveVoiceRatio', 'passiveVoicePerSentence', 'passiveVoiceRate', 'numberOfSentencesThatStartWithACoordinatingConjunction', 'numberOfSentencesThatStartWithADeterminer', 'numberOfSentencesThatStartWithASubordinatingPrepositionOrConjunction', 'numberOfSentencesThatStartWithAnAdjective', 'numberOfSentencesThatStartWithANoun', 'numberOfSentencesThatStartWithAPronoun', 'numberOfSentencesThatStartWithAnAdverb', 'numberOfSentencesThatStartWithAnArticle', 'numberOfSentencesThatStartWithACoordinatingConjunctionRatio', 'numberOfSentencesThatStartWithADeterminerRatio', 'numberOfSentencesThatStartWithASubordinatingPrepositionOrConjunctionRatio', 'numberOfSentencesThatStartWithAnAdjectiveRatio', 'numberOfSentencesThatStartWithANounRatio', 'numberOfSentencesThatStartWithAPronounRatio', 'numberOfSentencesThatStartWithAnAdverbRatio', 'numberOfSentencesThatStartWithAnArticleRatio', 'automatedReadabilityIndex', 'colemanLiauIndex', 'fleshReadingEase', 'fleschKincaidGradeLevel', 'gunningFogIndex', 'lasbarhetsIndex', 'smogGrading', 'daleChallReadabilityFormula', 'differentWordCount', 'differentWordsPerSentence', 'differentWordsRate', 'nounCount', 'nounsPerSentence', 'nounsRate', 'differentNounCount', 'differentNounsPerSentence', 'differentNounsRate', 'differentNounsDifferentWordsRatio', 'verbCount', 'verbsPerSentence', 'verbsRate', 'differentVerbCount', 'differentVerbsPerSentence', 'differentVerbsRate', 'differentVerbsDifferentWordsRatio', 'pronounCount', 'pronounsPerSentence', 'pronounsRate', 'differentPronounCount', 'differentPronounsPerSentence', 'differentPronounsRate', 'differentPronounsDifferentWordsRatio', 'adjectiveCount', 'adjectivesPerSentence', 'adjectivesRate', 'differentAdjectiveCount', 'differentAdjectivesPerSentence', 'differentAdjectivesRate', 'differentAdjectivesDifferentWordsRatio', 'adverbCount', 'adverbsPerSentence', 'adverbsRate', 'differentAdverbCount', 'differentAdverbsPerSentence', 'differentAdverbsRate', 'differentAdverbsDifferentWordsRatio', 'coordinatingConjunctionCount', 'coordinatingConjunctionsPerSentence', 'coordinatingConjunctionsRate', 'differentCoordinatingConjunctionCount', 'differentCoordinatingConjunctionsPerSentence', 'differentCoordinatingConjunctionsRate', 'differentCoordinatingConjunctionsDifferentWordsRatio', 'subordinatingPrepositionAndConjunctionCount', 'subordinatingPrepositionsAndConjunctionsPerSentence', 'subordinatingPrepositionsAndConjunctionsRate', 'differentSubordinatingPrepositionAndConjunctionCount', 'differentSubordinatingPrepositionsAndConjunctionsPerSentence', 'differentSubordinatingPrepositionsAndConjunctionsRate', 'differentSubordinatingPrepositionsAndConjunctionsDifferentWordsRatio', 'syllablesPerWord', 'charactersPerWord', 'NNP,NNP,NNP', 'VBD,DT,JJ', 'IN,DT,NNP', 'NNP,IN,DT', 'DT,NNP,NNP', 'JJ,NN,IN', 'NN,IN,DT', 'IN,DT,NN', 'NN,IN,NNP', 'IN,NNP,NNP', 'NNP,VBD,DT', 'VBD,DT,NN', 'DT,NN,IN', 'VBD,VBN,IN', 'NNP,NNP,VBD', 'IN,NN,IN', 'NNP,NNP,IN', 'NNP,IN,NNP', 'VBD,IN,DT', 'IN,DT,JJ', 'JJ,NNS,IN', 'DT,JJ,NN', 'IN,DT,NNS', 'IN,CD,NNP', 'VBN,IN,DT', 'DT,NN,NN', 'IN,PRP$,NN', 'NNP,VBD,VBN', 'NNP,CC,NNP', 'NNS,IN,DT', 'NN,IN,NN', 'DT,NN,VBD', 'NN,VBD,VBN', 'TO,VB,DT', 'NNP,POS,NN', 'ter', 'er_', '_wa', 'was', 'as_', 's_a', '_a_', 'an_', 'e_a', '_an', 'and', 'nd_', '_re', 'ent', '_of', 'of_', 'f_t', '_th', 'the', 'he_', 'on_', ',_a', 'at_', 'ed_', '_on', 'n_t', 'or_', 'ing', 'ng_', '_in', 'in_', 'd_t', 'd_a', '_he', '_to', 'ted', 'th_', 'al_', 'es_', 'ate', '_co', 'ion', 'ere', '_fo', 'for', 's,_', 'to_', 'ati', 'st_', 're_', '_be', 'ly_', 'her', '_hi', 'his', 'is_', 'e_t', 'en_', 'e_o', 't_t', 'tio', '_Th', 'age', 'agePerReview', 'reviewPerDay', 'reviewsPerUser', 'reviewsPerUserStdDev', 'discussionCount', 'reviewCount', 'registeredReviewCount', 'anonymouseReviewCount', 'registeredReviewRate', 'anonymouseReviewRate', 'registeredAnonymouseReviewRatio', 'userCount', 'occasionalUserCount', 'occasionalUserRate', 'registeredUserCount', 'anonymouseUserCount', 'registerdAnonymouseUserRatio', 'registeredUserRate', 'anonymouseUserRate', 'revertCount', 'revertReviewRatio', 'diversity', 'modifiedLinesRate', 'mostActiveUsersReviewCount', 'mostActiveUsersReviewRate', 'lastThreeMonthsReviewCount', 'lastThreeMonthsReviewRate']

# Select only the columns corresponding to the features in the list
X = data[features_cols]

X.sample(frac=1)

# Select qualityClass as the response (y)
y = data.qualityClass

# Select only the columns corresponding to the features in the list
X = data[features_cols]

X.sample(frac=1)

# Select qualityClass as the response (y)
y = data.qualityClass

print('\n')
print('Decision Tree')
print('\n')
# 10-fold cross-validation with decision tree PREDICTIONS
clf = DecisionTreeClassifier(random_state=8)
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('KNN')
print('\n')
# 10-fold cross-validation with knn PREDICTIONS
clf = KNeighborsClassifier(n_jobs=7, n_neighbors=49) # NORMAL
# clf = KNeighborsClassifier(n_jobs=7, n_neighbors=25) # PARALLEL
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Logistic Regression')
print('\n')
# 10-fold cross-validation with logistic regression PREDICTIONS
clf = LogisticRegression(n_jobs=7)
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Naive Bayes')
print('\n')
# 10-fold cross-validation with naive bayes PREDICTIONS
clf = GaussianNB()
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Random Forest')
print('\n')
# 10-fold cross-validation with random forst PREDICTIONS
clf = RandomForestClassifier(n_jobs=7, n_estimators=200, random_state=5, class_weight='auto')
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))


print('\n')
print('Support Vector Classifier')
print('\n')
# 10-fold cross-validation with support vector classifier PREDICTIONS
clf = LinearSVC(dual=False)
y_pred = cross_val_predict(clf, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred)))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))

print('\n')
print('XGBoost')
print('\n')
# fit model no training data
model = XGBClassifier(max_depth=3, n_estimators=1000, n_jobs=7)

# 10-fold cross-validation with decision tree PREDICTIONS
y_pred = cross_val_predict(model, X, y, cv=20)

print(metrics.classification_report(y, y_pred))
print('Accuracy: ' + str(metrics.accuracy_score(y, y_pred) ))
print('MSE: ' + str(metrics.mean_squared_error(y, y_pred)))

My approach - Second Dataset


Decision Tree


             precision    recall  f1-score   support

          1       0.69      0.72      0.71       400
          2       0.41      0.39      0.40       400
          3       0.36      0.37      0.37       400
          4       0.35      0.33      0.34       400
          5       0.43      0.42      0.43       400
          6       0.38      0.39      0.39       400
          7       0.53      0.54      0.53       400

avg / total       0.45      0.45      0.45      2800

Accuracy: 0.4525
MSE: 1.5707142857142857


KNN


             precision    recall  f1-score   support

          1       0.67      0.89      0.76       400
          2       0.45      0.33      0.38       400
          3       0.29      0.14      0.19       400
          4       0.34      0.27      0.30       400
          5       0.39      0.39      0.39       400
          6       0.34      0.40      0.37       400
          7       0.42      0.63      0.50       400

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


             precision    recall  f1-score   support

          1       0.75      0.79      0.77       400
          2       0.54      0.56      0.55       400
          3       0.48      0.43      0.46       400
          4       0.48      0.44      0.46       400
          5       0.59      0.54      0.56       400
          6       0.54      0.58      0.56       400
          7       0.67      0.74      0.70       400

avg / total       0.58      0.58      0.58      2800

Accuracy: 0.5842857142857143
MSE: 0.9128571428571428


  if diff:
