# Создание и обучение моделей SVM(Support vector machine), Random Forest, Naive Bayes, Logistic Regression

### Подключаем необходимые модули

In [26]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm, naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Подгружаем train dataset

In [27]:
df = pd.read_csv('train.csv')
df.rename(columns={'Unnamed: 0':'№'}, inplace=True )
df.head()

Unnamed: 0,№,category,filename,content,category_id
0,0,math.AP,math.AP_37,"['bound', 'pseudodiﬀerenti', 'oper', 'wiener',...",0
1,1,math.AP,math.AP_88,"['mathematisch', 'nachrichten', 'june', 'regul...",0
2,2,math.AP,math.AP_63,"['kam', 'nonlinear', 'beam', 'equat', 'solut',...",0
3,3,math.AP,math.AP_67,"['one', 'dimension', 'quantum', 'zakharov', 's...",0
4,4,math.AP,math.AP_54,"['asymmetr', 'domain', 'wall', 'small', 'angl'...",0


### Разбиваем обучающую выборку на тренировочную и валидационную

In [28]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['content'], df['category_id'],test_size=0.2)

In [29]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [30]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['content'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [6]:
print(Tfidf_vect.vocabulary_)

{'bound': 477, 'oper': 2999, 'wiener': 4843, 'space': 4077, 'laurent': 2400, 'jean': 2256, 'abstract': 13, 'aim': 89, 'extend': 1516, 'deﬁnit': 1106, 'weyl': 4829, 'calculu': 542, 'inﬁnit': 2193, 'dimension': 1141, 'set': 3900, 'replac': 3634, 'phase': 3177, 'ﬁrst': 4972, 'approach': 202, 'gener': 1733, 'integr': 2149, 'use': 4695, 'wigner': 4845, 'function': 1698, 'symbol': 4339, 'deﬁn': 1105, 'belong': 380, 'gaussian': 1726, 'measur': 2622, 'quadrat': 3454, 'form': 1645, 'dens': 1060, 'subspac': 4248, 'exampl': 1471, 'stochast': 4182, 'extens': 1517, 'sens': 3878, 'gross': 1823, 'continu': 868, 'second': 3841, 'satisﬁ': 3784, 'diﬀerenti': 1202, 'condit': 827, 'analog': 142, 'ﬁnite': 4970, 'one': 2990, 'need': 2847, 'introduc': 2178, 'hybrid': 1990, 'act': 43, 'variabl': 4714, 'subset': 4245, 'rest': 3663, 'ﬁnal': 4962, 'limit': 2463, 'rather': 3531, 'weak': 4811, 'give': 1762, 'monomi': 2754, 'link': 2471, 'construct': 859, 'classic': 698, 'ator': 276, 'theori': 4437, 'relat': 3607, 

In [7]:
print(Train_X_Tfidf)

  (0, 4999)	0.03646304998975176
  (0, 4980)	0.00592904755648553
  (0, 4978)	0.0068230099002327405
  (0, 4977)	0.0035932528894421817
  (0, 4976)	0.01551208475184481
  (0, 4975)	0.007244877897641702
  (0, 4972)	0.0028520550680891515
  (0, 4966)	0.002910533573391333
  (0, 4965)	0.005303194030800465
  (0, 4962)	0.005109482036537906
  (0, 4959)	0.012843522174753647
  (0, 4956)	0.008034066055890188
  (0, 4950)	0.007633883595999377
  (0, 4898)	0.0038635160702162334
  (0, 4897)	0.0025474227914392385
  (0, 4889)	0.019281640694990027
  (0, 4879)	0.026903156163172435
  (0, 4877)	0.004808415136443551
  (0, 4872)	0.007271487036136323
  (0, 4862)	0.04358004221015644
  (0, 4840)	0.0026567940166420897
  (0, 4837)	0.010249436795617966
  (0, 4833)	0.005589598142304095
  (0, 4825)	0.00906497160800462
  (0, 4823)	0.0027168188734440213
  :	:
  (1199, 196)	0.0043770134135965935
  (1199, 188)	0.0059934328718787
  (1199, 161)	0.002197864239308043
  (1199, 155)	0.003979217552667981
  (1199, 142)	0.005895168709

### Обучим модель и посмотрим на меру accuracy на валидационной выборке

In [31]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  87.33333333333333


In [32]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",round(accuracy_score(predictions_NB, Test_Y)*100,3))

Naive Bayes Accuracy Score ->  83.0


In [33]:

# fit the training dataset on the classifier
RF = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)

RF.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_RF = RF.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("RF Accuracy Score -> ",accuracy_score(predictions_RF, Test_Y)*100)

RF Accuracy Score ->  78.66666666666666


In [34]:
LR.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_LR = LR.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)



LR Accuracy Score ->  89.33333333333333


### Подгружаем test датасет

In [35]:
test = pd.read_csv('test.csv')
test.rename(columns={'Unnamed: 0':'№'}, inplace=True )
test.head()

Unnamed: 0,№,category,filename,content,category_id
0,0,math.AP,math.AP_37,"['global', 'stabil', 'equat', 'kerr', 'space',...",0
1,1,math.AP,math.AP_34,"['system', 'steep', 'potenti', 'well', 'indeﬁn...",0
2,2,math.AP,math.AP_23,"['global', 'lorentz', 'estim', 'natur', 'expon...",0
3,3,math.AP,math.AP_0,"['estim', 'dilat', 'map', 'giovanni', 'alessan...",0
4,4,math.AP,math.AP_4,"['new', 'exist', 'symmetri', 'result', 'least'...",0


In [36]:
Test_X = test.content
Test_Y = test.category_id

In [37]:
Encoder = LabelEncoder()
Test_Y = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['content'])
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [38]:
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  86.93333333333332


In [39]:
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("NB Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

NB Accuracy Score ->  84.13333333333334


In [40]:
predictions_RF = RF.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("RF Accuracy Score -> ",accuracy_score(predictions_RF, Test_Y)*100)

RF Accuracy Score ->  79.60000000000001


In [42]:
predictions_LR = LR.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)

LR Accuracy Score ->  86.66666666666667


In [44]:
columns_names = ['model_name','accuracy_score']
results = pd.DataFrame(columns = columns_names)
results.loc[0] = ['SVM', round(accuracy_score(predictions_SVM, Test_Y)*100,2)]
results.loc[1] = ['Random Forest', round(accuracy_score(predictions_RF, Test_Y)*100,2)]
results.loc[2] = ['Naive Bayes', round(accuracy_score(predictions_NB, Test_Y)*100,2)]
results.loc[3] = ['Logistic Regression', round(accuracy_score(predictions_LR, Test_Y)*100,2)]
results.to_csv(r'results.csv')