In [1]:
import json as j
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
json_data = None
data = pd.read_csv("data_large_5000.csv")

In [6]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

data['tweets'] = data['tweets'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())


In [7]:
x = data.tweets
y = data.tagname

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [9]:
pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
                     ('chi',  SelectKBest(chi2, k=10000)),
                     ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))])

In [10]:
model = pipeline.fit(X_train, y_train)

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

In [11]:
feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

In [26]:
predict = model.predict(X_test)

In [28]:
import os
import time
import sys

dirName = 'results'
 
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

## predictions for first 10 test samples
text_hollywood = ''
text_bollywood = ''
text_ml = ''
text_football = ''
text_bigg_boss = ''
text_food = '' 
text_hp_day = ''
text_politics = ''
text_mobiles = ''
text_cricket = ''
counter  = 0
for doc, category in zip(X_test, predict):
    if category == "hollywood":
        f= open('results/'+ category+".txt","w+")
        if (text_hollywood != ""):
            text_hollywood = text_hollywood + '\n' + doc
        else:
            text_hollywood = doc
        f.write(text_hollywood)
        f.close()
    if category == "bollywood":
        f= open('results/'+ category +".txt","w+")
        if (text_bollywood != ""):
            text_bollywood = text_bollywood + '\n' + doc
        else:
            text_bollywood = doc
        f.write(text_bollywood)
        f.close()
    if category == "machine learning":
        f= open('results/'+ category +".txt","w+")
        if (text_ml != ""):
            text_ml = text_ml + '\n' + doc
        else:
            text_ml = doc
        f.write(text_ml)
        f.close()
    if category == "football":
        f= open('results/'+ category +".txt","w+")
        if (text_football != ""):
            text_football = text_football + '\n' + doc
        else:
            text_football = doc
        f.write(text_football)
        f.close()
    if category == "bigg boss":
        f= open('results/'+ category +".txt","w+")
        if (text_bigg_boss != ""):
            text_bigg_boss = text_bigg_boss + '\n' + doc
        else:
            text_bigg_boss = doc
        f.write(text_bigg_boss)
        f.close()
    if category == "food":
        f= open('results/'+ category +".txt","w+")
        if (text_food != ""):
            text_food = text_food + '\n' + doc
        else:
            text_food = doc
        f.write(text_food)
        f.close()
    if category == "happy birthday":
        f= open('results/'+ category +".txt","w+")
        if (text_hp_day != ""):
            text_hp_day = text_hp_day + '\n' + doc
        else:
            text_hp_day = doc
        f.write(text_hp_day)
        f.close() 
    if category == "politics":
        f= open('results/'+ category +".txt","w+")
        if (text_politics != ""):
            text_politics = text_politics + '\n' + doc
        else:
            text_politics = doc
        f.write(text_politics)
        f.close() 
    if category == "mobiles":
        f= open('results/'+category+".txt","w+")
        if (text_mobiles != ""):
            text_mobiles = text_mobiles + '\n' + doc
        else:
            text_mobiles = doc
        f.write(text_mobiles)
        f.close()
    if category == "cricket":
        f= open('results/'+ category +".txt","w+")
        if (text_cricket != ""):
            text_cricket = text_cricket + '\n' + doc
        else:
            text_cricket = doc
        f.write(text_cricket)
        f.close()



Directory  results  already exists


# Top 5  Sentence

In [30]:
target_names = ['1', '2', '3', '4', '5']
print("top 10 keywords per class:")
for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (label, " ".join(feature_names[top10])))

print("accuracy score: " + str(model.score(X_test, y_test)))

top 10 keywords per class:
1: ko goosebump hina bigg shukla india bigg boss asim shilpa shilpashind bb
2: kartikaaryan birthday boy team firstbuzz marjaavaan khan cineplex sanayairani aishwaryaraibachchan salmankhan bollywood
3: ipl bharatarmyaward sanjusamson bringbackdhoni west indi sanju selector indvssl odi teamindia
4: italian boston bruin italianfood food itali pizza lezzet recip spaghetti pasta
5: arsenal liverpool sack spur frank footbal tottenham giroud mourinho chelsea
accuracy score: 0.9498583569405099
