# Import Module

In [2]:
# read in some helpful libraries
import nltk                       # the natural langauage toolkit, open-source NLP
import pandas as pd               # pandas dataframe
import re                         # regular expression
from nltk.corpus import stopwords  
from gensim import parsing        # Help in preprocessing the data, very efficiently
import gensim
import numpy as np


# Read CSV

In [3]:
# Read the tweet dataset from CSV into a pandas DataFrame
df_train = pd.read_csv(filepath_or_buffer="data_large_5000.csv")

# Show Data Table

In [4]:
# Look at one raw tweet and at the first few rows of the table.
# The '\n' has to be an argument of print(); written after the call it only
# builds a tuple that is thrown away, and no newline is printed.
print(df_train['tweets'][2], '\n')
df_train.head()

 ALE TeamIndia for the upcoming series against West Indies announced INDvWI


Unnamed: 0,tweets,tagname
0,amp TeamIndia IndvsBan DayandNightTest PinkBal...,cricket
1,BharatArmyAwards Here are the nominees for th...,cricket
2,ALE TeamIndia for the upcoming series against...,cricket
3,Getting into PinkBallTest mode TeamIndia INDvBAN,cricket
4,Umpiring in the first ever pink ball match at...,cricket


# Check dimensions, null-value counts, and data types

In [5]:
## Table dimensions as reported by DataFrame.shape
table_shape = df_train.shape
print("Shape:", table_shape, '\n')

## Missing values: isnull() flags each cell, sum() totals the flags per column
null_counts = df_train.isnull().sum()
print("Null Value Statistics:", '\n \n', null_counts)
print('\n')

## Data type of every column
column_dtypes = df_train.dtypes
# The two adjacent string literals are concatenated into a single print argument (no comma between them)
print("Data Type of All Columns:" '\n \n', column_dtypes)

Shape: (35300, 2) 

Null Value Statistics: 
 
 tweets     0
tagname    0
dtype: int64


Data Type of All Columns:
 
 tweets     object
tagname    object
dtype: object


# Check tagname (unique)

In [6]:
## Collect the distinct tag names found in the tagname column
tag_names = pd.unique(df_train['tagname'])
print(tag_names)

['cricket' 'mobiles' 'food' 'machine learning' 'hollywood'
 'happy birthday' 'football' 'bollywood' 'bigg boss' 'politics']


# Assign a numeric id to each tag

In [7]:
## Give every tag name a consecutive integer id (0, 1, 2, ...) following the order of tag_names
tag_name_id = {name: tag_id for tag_id, name in enumerate(tag_names)}

##  Print the dictionary created
for name, tag_id in tag_name_id.items():
    print(name, tag_id)

cricket 0
mobiles 1
food 2
machine learning 3
hollywood 4
happy birthday 5
football 6
bollywood 7
bigg boss 8
politics 9


# Invert the mapping: id to tag name

In [8]:
## Reverse lookup table: numeric id -> tag name
id_to_tag_name = dict(zip(tag_name_id.values(), tag_name_id.keys()))
for tag_id in id_to_tag_name:
    print(tag_id, id_to_tag_name[tag_id])

0 cricket
1 mobiles
2 food
3 machine learning
4 hollywood
5 happy birthday
6 football
7 bollywood
8 bigg boss
9 politics


# Add a tag_id column to the DataFrame

In [9]:
## Add a tag_id column to the DataFrame by mapping each tag name to its numeric id
def get_author_id(tag_name: str) -> int:
    """Return the numeric id stored for ``tag_name`` in ``tag_name_id``.

    Despite the function's name, the lookup is on tag names (the values of the
    ``tagname`` column), not authors. Raises ``KeyError`` if ``tag_name`` is
    not a key of ``tag_name_id``.
    """
    return tag_name_id[tag_name]

# Element-wise lookup: each tagname value is translated to its id in the new column
df_train['tag_id'] = df_train['tagname'].map(get_author_id)

# Show data(after adding)

In [10]:
# Display the first rows again; the table now carries the tag_id column
df_train.head()

Unnamed: 0,tweets,tagname,tag_id
0,amp TeamIndia IndvsBan DayandNightTest PinkBal...,cricket,0
1,BharatArmyAwards Here are the nominees for th...,cricket,0
2,ALE TeamIndia for the upcoming series against...,cricket,0
3,Getting into PinkBallTest mode TeamIndia INDvBAN,cricket,0
4,Umpiring in the first ever pink ball match at...,cricket,0


# Define the text-preprocessing function

In [11]:
# Build the English stop-word set once; transformText is mapped over every row,
# so rebuilding it inside the function would reload the NLTK list per tweet.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def transformText(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every non-ASCII character with a space;
      3. drop English stop words and tokens shorter than 3 characters;
      4. strip punctuation and digits;
      5. collapse repeated whitespace;
      6. stem the remaining text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The preprocessed text.
    """
    preprocessing = gensim.parsing.preprocessing

    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # str.split() with no arguments already splits on runs of whitespace, so no
    # separate whitespace-collapsing pass is needed before tokenising.
    # Stop words and tokens with fewer than 3 characters are dropped in one pass.
    filtered_words = [
        word for word in text.split()
        if word not in _ENGLISH_STOPWORDS and len(word) >= 3
    ]

    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)

    # Remove the punctuation
    text = preprocessing.strip_punctuation2(text)

    # Strip all the numerics
    text = preprocessing.strip_numeric(text)

    # Strip multiple whitespaces left behind by the two removals above
    text = preprocessing.strip_multiple_whitespaces(text)

    # Stemming
    return preprocessing.stem_text(text)

# Apply the function to the data

In [12]:
# NOTE: this overwrites the raw 'tweets' column in place, so re-running the cell
# applies transformText to text that has already been preprocessed.
df_train['tweets'] = df_train['tweets'].map(transformText)

# Print data after preprocessing

In [13]:
## Show the first three tweets after preprocessing, separated by blank lines
for row in (0, 1):
    print(df_train['tweets'][row], '\n')
print(df_train['tweets'][2])

amp teamindia indvsban dayandnighttest pinkballtest 

bharatarmyaward nomine men limit perform year hurri vote 

al teamindia upcom seri west indi announc indvwi


# Split data in training data(67%) and test data(33%)

In [14]:
## Hold out 33% of the rows for testing; random_state pins the shuffle
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_train['tweets'],
    df_train['tag_id'],
    test_size=0.33,
    random_state=42,
)
n_train, n_test = len(X_train), len(X_test)
print("Training Sample Size:", n_train, ' ', "Test Sample Size:", n_test)
X_train

Training Sample Size: 23651   Test Sample Size: 11649


15652    musictv hollywood xma song need right hollywoo...
30260    humara munna kisi kam hai kya gonnatellmykid t...
34761    sana fam best fandom shilpashind fam full enjo...
24499    okai let win tottenham awai cfc chelsea superf...
27857    granturco lappel chelsea devant ta contr dcisi...
                               ...                        
16850    somebodi got stand ovat richard jewel premier ...
6265     fall pasta thanksgiv menu delici thanksgiv pas...
11284    machin learn realli technolog artificialintell...
860                get pinkballtest mode teamindia indvban
15795              mean receiv defer wwii would point govt
Name: tweets, Length: 23651, dtype: object

# Building the Model

In [15]:
## Learn the vocabulary from the training tweets and build the term-count matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# Fitted on the training split only; the same fitted vectorizer is reused on X_test later
X_train_counts = count_vect.fit_transform(X_train)
# Displayed as the cell output
X_train_counts.shape

## Look up the stemmed token 'mistak' (mistake -> mistak after stemming)
## NOTE(review): per the scikit-learn docs, vocabulary_ maps a term to its feature index,
## so the disabled line below would print an index, not an occurrence count -- confirm before re-enabling.
#print ('mistak appears:', count_vect.vocabulary_.get(u'mistak') , 'in the corpus')

(23651, 20591)

In [16]:
## Turn the raw term counts into TF-IDF weighted features
from sklearn.feature_extraction.text import TfidfTransformer

# Fit on the training counts, then transform those same counts
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print('Dimension of TF-IDF vector :', X_train_tfidf.shape)

Dimension of TF-IDF vector : (23651, 20591)


# Apply Classification

In [17]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
# clf is bound to the value returned by fit(), as in a chained MultinomialNB().fit(...) call.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

# Prediction

In [18]:
## Prediction part

# Reuse the vectorizer and TF-IDF transformer fitted on the training split:
# transform() (not fit_transform()) is called, so nothing is re-fitted on X_test.
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# One predicted tag_id per test tweet
predicted = clf.predict(X_new_tfidf)

# Get Result

In [19]:
## Print "<tweet> => <predicted tag>" for the test samples, at most PREVIEW_LIMIT rows.
## The cap of 20001 rows is larger than the test split in the recorded run (11649 rows),
## so every prediction is printed; lower PREVIEW_LIMIT for a short preview.
PREVIEW_LIMIT = 20001

for shown, (doc, category) in enumerate(zip(X_test, predicted)):
    if shown >= PREVIEW_LIMIT:
        break
    print('%r => %s' % (doc, id_to_tag_name[category]))

'loos respect buddi still stick rahul jadhav know player' => football
'popcorn pizza blanket hollywood movi contest fun moviejunki contest action' => hollywood
'felic ao para otro mundo posibl solo necesitan la gana capacidad crecer aprend' => football
'cours pmi acp exam prep pmbok projectmanag pmiacp javascript nodej css angul' => machine learning
'al teamindia upcom seri west indi announc indvwi' => cricket
'promot chelsea bad year sinc' => football
'articl discuss import explain examin black box horror stori transpar predict' => machine learning
'main area graphdatabas provid context amp detail new white' => machine learning
'local tourist girl vibez hollywood girlnightout joytrib style vibe maga fashion afro' => hollywood
'wait watch theskyispink priyankachopra priyanka priyankachoprajona nickjona' => bollywood
'ikeji jose mourinho yesterdai todai forev pl dont chang properchel chelsea' => football
'friend gui rememb film ghost insid real charm hous amp salem lot hous check' => bo

'somebodi got stand ovat richard jewel premier hollywood everyth fell place' => hollywood
'meadow pasta post well actual also mac chees kno purpl cloudmeadow gamedev erodev food pasta macandc' => food
'homemad white sauc pasta order inbox call whatsapp pasta food lahor wintersess' => food
'silver cat ring fashion style love instagood like ootd moda fashionblogg model photogr' => happy birthday
'kolkata gear pinkballtest teamindia indvban' => cricket
'india becom world leader artificialintelliegnc vishal sikka hindu' => machine learning
'eden hazard chelsea season far chelsea parti without' => football
'team format useless select' => cricket
'lunch time bungalowcaf mushroomcheeseomelett omelett matcha pandes donut doughnut lecheflan' => food
'prep pinkballtest underwai teamindia indvban' => cricket
'trailer fabul tanhajitrail hope world class movi ajaydevgn bollywood sho' => bollywood
'biggest superstar khan present bollywood bollywoodceleb movi actor' => bollywood
'trade bollywood succ

'al teamindia upcom seri west indi announc indvwi' => cricket
'pasta on skillet dinner februari bonappetit magazin' => food
'mourinho champion leagu chelsea amp inter milan porto amp inter milan' => football
'love thick farmstyl shred chees like give flavor bite great homeco' => food
'time favorit foodporn pasta shrimp yummywithatummi' => food
'somebodi got stand ovat richard jewel premier hollywood everyth fell place' => hollywood
'awesom sound podcast power' => bollywood
'amp follow chanc win funko hollywood exclus pop prize pack funko pop funkohollywood' => hollywood
'smart digit life infograph smartciti iot bigdata cybersecur healthcar educ infosec machinelearni' => machine learning
'le nouvel rgle frank lampard dan vestiair de blue donc pour retard lentran' => football
'color le encantara tener su mano del redmi note pro forest green miner grai pearl white lo leemo' => mobiles
'katvond katvondbeauti katvondartistrycollect vondsho highvoltagetattoo' => football
'darbar poster rajin

'husband who carnivor doesnt realiz he eat plantbas brownric pasta rapini' => food
'bigg boss weekendkavaarwithsalman updat bharti husband haarshlimbachiyaa stage mani kid' => bigg boss
'redmi note' => mobiles
'al teamindia upcom seri west indi announc indvwi' => cricket
'india bangladesh readi make histori much await dai night test match readi playbold' => cricket
'auguri biagio izzo che oggi compi anni ohio ohionotizi' => happy birthday
'com strike kovac inter milan first chelsea goal still pend cfc' => football
'wizard american film produc metro goldwyn mayer star among other judi garland music' => hollywood
'ranveer singh deepika padukon look simpli amaz visit tirupati wed anniversari deepikap' => bollywood
'romanc suspens well craft diverg storylin evenli pace surpris maintain suspens' => hollywood
'heart throb set woo audienc handsom hunk kartikaaryan hearthrob sexi aan' => bollywood
'teamindia readi pinkballtest indvban' => cricket
'supermodel hollywood fashion' => hollywood
'pi

'heavenli girl nikitadutta wednesdaymotiv diva heavenli bollywood' => bollywood
'pasta delight abuja abujatwittercommun pasta' => food
'chelsea date set chelsea appeal two window transfer ban' => football
'amp follow chanc win funko hollywood exclus pop prize pack funko pop funkohollywood' => hollywood
'sana fam best fandom shilpashind fam full enjoy abus tweet aag pang lete' => bigg boss
'favourit actress katrina kaif deepika padukon retweet katrina kaif like deepika padukon katr' => bollywood
'baaghi tigershroff new look reaction bollywood bollywoodnew bollyspi' => bollywood
'al teamindia upcom seri west indi announc indvwi' => cricket
'forev madhuridixit madhuridixitnen katrinakaif kritisanon sonakshisinha aliabhatt akshumkar ayushmannkhurrana amit' => bollywood
'pulipaka download ebook page pdf algebra differenti calculu computersci amp machinelearn bigdata' => machine learning
'big shout woliagba ayoajewol birthdai mai lord continu shine light countenanc' => happy birthday
'man ci

'incred dai sundai shoot someth super excit shoot shoot film behindthescen' => bollywood
'diyett olanlar izlemesin azda eriyen yle bir pasta yaptk diyet bozdurur' => food
'wait chelsea footbal club chelsea transferban stamfordbridg' => football
'nine england last debut graduat chelsea academi' => football
'best python cours udemi master python program complet python bootcamp' => machine learning
'kim kardashian basin less bathroom sink cost least like hollywood new' => hollywood
'quelqu soin beaut dernir minut salon commenc demain' => food
'job miami usa part time merchandis sa provid nation retail merchandis servic ho sa prov' => machine learning
'whatev choos count great tast high qualiti famili recip appreci choos waldo' => food
'said loud clear jose unit sai spur fuck jose mourinho' => football
'bollywood visit golden templ goldentempl amritsar bollywoodstar celebritynew spiritu' => bollywood
'al al fanessa serri fohn wchee shoot new film imagin judgment releas jan hollywood' => ho

'follow followback footi footbal keyr sport avfc mufc afc cfc mcfc chelsea villa manu manciti pl' => football
'make sure check apsychospathmovi support boi first commerci break brought' => bollywood
'love thick farmstyl shred chees like give flavor bite great homeco' => food
'go share origin layer look beauti thought love good' => bollywood
'niver ceclia nosso amigo parceiro feflausino noit super agradvel cheia amigo querido' => happy birthday
'upload jupyt notebook rstat exampl excel busi data scienc' => machine learning
'com strike kovac inter milan first chelsea goal still pend cfc' => football
'pandit jawaharlalnehru leader influenc fellow leader amp great person inspir nurtur tini litt' => happy birthday
'hollywood' => hollywood
'somebodi got stand ovat richard jewel premier hollywood everyth fell place' => hollywood
'que paulinarubio con lo rdito ltima demanda colat autoproduca mega film hollywood narrado' => hollywood
'truth convolut manag model complex neuralnetwork' => machine

'bollywood massiv islamophobia problem right need stop serious stop' => bollywood
'norton artificialintellig work court make highlight reel machinelearn' => machine learning
'sana fam best fandom shilpashind fam full enjoy abus tweet aag pan' => bigg boss
'hina khan share hack photo glamsham bollywood hack' => bollywood
'novembr moriva ennio flaiano maestro del novecento italiano ohio ohionotizi compleanno' => happy birthday
'pleas give big happi birthdai shout twin minni bat amp sox year old todai' => happy birthday
'al teamindia upcom seri west indi announc indvwi' => cricket
'gryzzi pasta realli make night' => food
'chicago student charg write code spread isi propaganda cybersecur databreach' => machine learning
'al teamindia upcom seri west indi announc indvwi' => cricket
'vanloon add trillion global economi artificialintellig machinelearn fint' => machine learning
'super fast weeknight dinner ham tetrazzini spin tradit version sure pleas whole fam' => food
'cucina italianrecip sog

In [20]:
np.mean(predicted == y_test) ## fraction of test rows whose predicted id equals the true id (0.9459 in the recorded run)


0.9459181045583311

In [None]:
# Classifier - Algorithm - SVM
# Imported here because nothing earlier in the notebook brings these names into scope.
from sklearn import svm
from sklearn.metrics import accuracy_score

# Fit the classifier on the same TF-IDF training features used for Naive Bayes.
# degree and gamma are kept from the original call; per the scikit-learn docs they
# apply to the poly/rbf/sigmoid kernels, not to kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf, y_train)

# Predict the labels of the held-out test split (TF-IDF features built in the prediction cell)
predictions_SVM = SVM.predict(X_new_tfidf)

# accuracy_score expects (y_true, y_pred); the result is scaled to a percentage
print("SVM Accuracy Score -> ", accuracy_score(y_test, predictions_SVM)*100)