In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')



In [2]:
flairs = ["AskIndia", "Non-Political", "[R]eddiquette", 
          "Scheduled", "Photography", "Science/Technology",
          "Politics", "Business/Finance", "Policy/Economy",
          "Sports", "Food", "AMA"]

In [3]:
data = pd.read_csv('data.csv')
data.fillna("",inplace = True)

In [4]:
data.head()

Unnamed: 0,author,authors,body,comment,comms_num,created,flair,id,score,title,url,combined_features
0,dhavalcoholic,ICICIPruLifeIns,reposting lack activity r askindiahello last y...,dear policy holder dhavalcoholic request help ...,1,1386254000.0,AskIndia,1s57oi,1,need feedback insurance policy took xpost aski...,https://www.reddit.com/r/india/comments/1s57oi...,need feedback insurance policy took xpost aski...
1,amitkumarthakur,RAD-Business RAD-Business None barcam10 _snor...,24hrs local police station register case dont ...,calm downgo sp office town file grievance imme...,24,1554080000.0,AskIndia,b7pvwt,94,somebody want kill full family,https://www.reddit.com/r/india/comments/b7pvwt...,somebody want kill full familycalm downgo sp o...
2,FrustratedOCIHopeful,plshelpthedog ayyylmaaaoo Proper_Boysenberry ...,hello askindia first time poster long time lur...,honestly supervisor behaved exactly government...,27,1555361000.0,AskIndia,bdfid1,10,ambassador india takes back newly issued oci c...,https://www.reddit.com/r/india/comments/bdfid1...,ambassador india takes back newly issued oci c...
3,aloo_vs_bhaloo,vcdarklord tilismilis aloo_vs_bhaloo dogaa fo...,r tooafraidtoask india edition,modi control sex desires jerk someone else pro...,22,1566529000.0,AskIndia,cu1xn4,18,randians afraid ask,https://www.reddit.com/r/india/comments/cu1xn4...,randians afraid askmodi control sex desires je...
4,multubunu,,hello submitted r raskindia week ago got answe...,,0,1361085000.0,AskIndia,18ntue,0,askindia cingari cengar tzengar,https://www.reddit.com/r/india/comments/18ntue...,askindia cingari cengar tzengarhttps://www.red...


In [9]:
def logisticreg(X_train, X_test, y_train, y_test):

  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
  logreg.fit(X_train, y_train)

  y_pred = logreg.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))


In [10]:
def randomforest(X_train, X_test, y_train, y_test):
  ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
  ranfor.fit(X_train, y_train)

  y_pred = ranfor.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [11]:
def train_test(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
   
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)

In [12]:
cat = data.flair

V = data.combined_features
W = data.comment
X = data.title
Y = data.body
Z = data.url

print("Flair Detection using Title as Feature")
train_test(X,cat)
print("Flair Detection using Combined Features")
train_test(V,cat)

Flair Detection using Title as Feature
Results of Random Forest
accuracy 0.9040178571428571
                    precision    recall  f1-score   support

          AskIndia       0.94      0.91      0.93        34
     Non-Political       0.78      0.93      0.85        30
     [R]eddiquette       0.95      0.80      0.86        44
         Scheduled       0.87      0.89      0.88        38
       Photography       0.96      0.98      0.97        45
Science/Technology       0.94      0.96      0.95        47
          Politics       0.85      0.97      0.90        34
  Business/Finance       0.85      0.95      0.90        37
    Policy/Economy       0.95      0.87      0.91        46
            Sports       1.00      0.83      0.91        48
              Food       0.84      0.86      0.85        37
               AMA       0.89      1.00      0.94         8

          accuracy                           0.90       448
         macro avg       0.90      0.91      0.90       448
      

In [13]:
X_train, X_test, y_train, y_test = train_test_split(V, cat, test_size=0.2, random_state = 42)
ranfor = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                  ])
RM = ranfor.fit(X_train, y_train)
pickle.dump(RM,open("RM.pkl",'wb'))
y_pred = ranfor.predict(X_test)

In [14]:
import nltk