In [1]:
from sklearn import preprocessing

import numpy as np
import pandas as pd

import os

from english_words import english_words_lower_alpha_set # set of most valid 25k english words

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [2]:
final_df = pd.read_csv('final.csv')
final_df

Unnamed: 0.1,Unnamed: 0,id,abstract,labels
0,0,0,"Turing machines and G\""odel numbers are import...",cs
1,1,1,RNA-sequencing has revolutionized biomedical r...,stat
2,2,2,Queuing models provide insight into the tempor...,physics
3,3,3,"In a multiple-object auction, every bidder tri...",cs
4,4,4,In arXiv:1109.6438v1 [math.AG] we introduced a...,math
...,...,...,...,...
86204,86204,86207,"Based on Grad-Shafranov-like equations, a gyro...",physics
86205,86205,86208,Persistent homology is a method for probing to...,cs
86206,86206,86209,We study a class of simply connected manifolds...,math
86207,86207,86210,"In this paper, we resolve the computational co...",cs


In [3]:
final_df.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
final_df

Unnamed: 0,id,abstract,labels
0,0,"Turing machines and G\""odel numbers are import...",cs
1,1,RNA-sequencing has revolutionized biomedical r...,stat
2,2,Queuing models provide insight into the tempor...,physics
3,3,"In a multiple-object auction, every bidder tri...",cs
4,4,In arXiv:1109.6438v1 [math.AG] we introduced a...,math
...,...,...,...
86204,86207,"Based on Grad-Shafranov-like equations, a gyro...",physics
86205,86208,Persistent homology is a method for probing to...,cs
86206,86209,We study a class of simply connected manifolds...,math
86207,86210,"In this paper, we resolve the computational co...",cs


In [5]:
X = final_df['abstract']
y = final_df['labels']

In [6]:
le = preprocessing.LabelEncoder()
labels = le.fit_transform(y)

In [7]:
le.classes_

array(['cs', 'math', 'physics', 'stat'], dtype=object)

In [8]:
cv = CountVectorizer(stop_words='english')
vectors = cv.fit_transform(X)

In [9]:
print(vectors)
print(labels)

  (0, 99450)	5
  (0, 58575)	2
  (0, 69192)	2
  (0, 68581)	1
  (0, 48719)	1
  (0, 74443)	1
  (0, 96653)	2
  (0, 23248)	3
  (0, 23249)	1
  (0, 11212)	1
  (0, 66239)	1
  (0, 81929)	1
  (0, 91352)	1
  (0, 48672)	1
  (0, 75986)	1
  (0, 20312)	1
  (0, 48666)	1
  (0, 101053)	1
  (0, 66544)	2
  (0, 38248)	4
  (0, 34784)	1
  (0, 34288)	1
  (0, 34111)	1
  (0, 18733)	1
  (0, 94614)	1
  :	:
  (86208, 8669)	2
  (86208, 56781)	1
  (86208, 105559)	3
  (86208, 12461)	1
  (86208, 50401)	1
  (86208, 90226)	1
  (86208, 17395)	1
  (86208, 8048)	1
  (86208, 59356)	1
  (86208, 17351)	1
  (86208, 85470)	1
  (86208, 102790)	1
  (86208, 77588)	1
  (86208, 18803)	1
  (86208, 55555)	2
  (86208, 43406)	1
  (86208, 97739)	1
  (86208, 68152)	1
  (86208, 23431)	1
  (86208, 9600)	1
  (86208, 11295)	1
  (86208, 17762)	1
  (86208, 8701)	1
  (86208, 33946)	1
  (86208, 45014)	1
[0 3 2 ... 1 0 0]


In [73]:
X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.30)

In [98]:
mlp = MLPClassifier()
mlp.fit(X_train,y_train)
preds_mlp = mlp.predict(X_test)



In [99]:
accuracy_score(preds_mlp, y_test)

0.8947531222209334

In [76]:
print(classification_report(preds_mlp, y_test))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      8586
           1       0.94      0.91      0.93      6651
           2       0.93      0.92      0.93      7655
           3       0.74      0.82      0.78      2971

    accuracy                           0.89     25863
   macro avg       0.87      0.88      0.88     25863
weighted avg       0.90      0.89      0.89     25863



In [100]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
preds_lr = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [101]:
accuracy_score(preds_lr, y_test)

0.8750338321153772

In [102]:
print(classification_report(preds_lr, y_test))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      8494
           1       0.93      0.90      0.92      6695
           2       0.92      0.92      0.92      7542
           3       0.71      0.75      0.73      3132

    accuracy                           0.88     25863
   macro avg       0.85      0.86      0.86     25863
weighted avg       0.88      0.88      0.88     25863



In [109]:
test = ["""The present paper is a sample survey analysis, examined based on correlation techniques. The usage of
mobile phones is clearly almost un-avoidable these days and as such the authors have made a systematic
survey through a well prepared questionnaire on making use of mobile phones to the maximum extent.
These samples are various economical groups across a population of over one-lakh people. The results
are scientifically categorized and interpreted to match the ground reality. 

"""]

In [110]:
test = cv.transform(test)

In [111]:
names = np.array(['cs', 'math', 'physics', 'stat'])

In [112]:
result1 = mlp.predict(test)
result2 = lr.predict(test)

In [113]:
print(names[result2],names[result1])

['cs'] ['cs']


In [114]:
import pickle
pickle.dump(mlp,open('classifier.sav','wb'))
pickle.dump(cv,open('cv.sav','wb'))