# Import Module

In [1]:
# read in some helpful libraries
import nltk                       # the natural langauage toolkit, open-source NLP
import pandas as pd               # pandas dataframe
import re                         # regular expression
from nltk.corpus import stopwords  
from gensim import parsing        # Help in preprocessing the data, very efficiently
import gensim
import numpy as np


# Read CSV

In [2]:
# Loading in the training data with Pandas
df_train = pd.read_csv("data_large_5000.csv")

# Show Data Table

In [3]:
# look at the first few rows and how the text looks like
print (df_train['tweets'][2]) , '\n'
df_train.head()

 ALE TeamIndia for the upcoming series against West Indies announced INDvWI


Unnamed: 0,tweets,tagname
0,amp TeamIndia IndvsBan DayandNightTest PinkBal...,cricket
1,BharatArmyAwards Here are the nominees for th...,cricket
2,ALE TeamIndia for the upcoming series against...,cricket
3,Getting into PinkBallTest mode TeamIndia INDvBAN,cricket
4,Umpiring in the first ever pink ball match at...,cricket


# Check Dimensions ,Check Null Value and is sum and data type

In [4]:
## check the dimensions of the table
print ("Shape:", df_train.shape, '\n')

## Check if there is any NULL values inside the dataset
print ("Null Value Statistics:", '\n \n', df_train.isnull().sum()) ## Sum will tell the total number of NULL values inside the dataset
print ('\n')

## Explore the data types of your dataset
print ("Data Type of All Columns:" '\n \n', df_train.dtypes)

Shape: (35300, 2) 

Null Value Statistics: 
 
 tweets     0
tagname    0
dtype: int64


Data Type of All Columns:
 
 tweets     object
tagname    object
dtype: object


# Check tagname (unique)

In [5]:
## Collect all unique author names from author column
tag_names = df_train['tagname'].unique()
print (tag_names)

['cricket' 'mobiles' 'food' 'machine learning' 'hollywood'
 'happy birthday' 'football' 'bollywood' 'bigg boss' 'politics']


# Give specific id

In [6]:
tag_name_id = {}
assign_id = 0
for name in tag_names:
    tag_name_id[name] = assign_id
    assign_id += 1  ## Get a new id for new author
    
##  Print the dictionary created
for key, values in tag_name_id.items():
    print (key, values)

cricket 0
mobiles 1
food 2
machine learning 3
hollywood 4
happy birthday 5
football 6
bollywood 7
bigg boss 8
politics 9


# Change key to value and value to key

In [7]:
id_to_tag_name = {v: k for k, v in tag_name_id.items()}
for key, values in id_to_tag_name.items():
    print (key, values)

0 cricket
1 mobiles
2 food
3 machine learning
4 hollywood
5 happy birthday
6 football
7 bollywood
8 bigg boss
9 politics


# Add new column in csv

In [8]:
## Add a new column to pandas dataframe, with the author name mapping
def get_author_id(tag_name):
    return tag_name_id[tag_name]

df_train['tag_id'] = df_train['tagname'].map(get_author_id)

# Show data(after adding)

In [9]:
df_train.head()

Unnamed: 0,tweets,tagname,tag_id
0,amp TeamIndia IndvsBan DayandNightTest PinkBal...,cricket,0
1,BharatArmyAwards Here are the nominees for th...,cricket,0
2,ALE TeamIndia for the upcoming series against...,cricket,0
3,Getting into PinkBallTest mode TeamIndia INDvBAN,cricket,0
4,Umpiring in the first ever pink ball match at...,cricket,0


# Create Function preprocessing of data

In [10]:
def transformText(text):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    
    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

# apply funtion on data

In [11]:
df_train['tweets'] = df_train['tweets'].map(transformText)

# Print data after prepossing

In [12]:
## Print a couple of rows after the preprocessing of the data is done

print (df_train['tweets'][0] , '\n')
print (df_train['tweets'][1] , '\n')
print (df_train['tweets'][2])

amp teamindia indvsban dayandnighttest pinkballtest 

bharatarmyaward nomine men limit perform year hurri vote 

al teamindia upcom seri west indi announc indvwi


# Split data in training data(67%) and test data(33%)

In [13]:
## Split the data 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_train['tweets'], df_train['tag_id'], test_size=0.33, random_state=42)
print ("Training Sample Size:", len(X_train), ' ', "Test Sample Size:" ,len(X_test))
X_train

Training Sample Size: 23651   Test Sample Size: 11649


15652    musictv hollywood xma song need right hollywoo...
30260    humara munna kisi kam hai kya gonnatellmykid t...
34761    sana fam best fandom shilpashind fam full enjo...
24499    okai let win tottenham awai cfc chelsea superf...
27857    granturco lappel chelsea devant ta contr dcisi...
                               ...                        
16850    somebodi got stand ovat richard jewel premier ...
6265     fall pasta thanksgiv menu delici thanksgiv pas...
11284    machin learn realli technolog artificialintell...
860                get pinkballtest mode teamindia indvban
15795              mean receiv defer wwii would point govt
Name: tweets, Length: 23651, dtype: object

# Building the Mode

In [14]:
## Get the word vocabulary out of the data
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

## Count of 'mistak' in corpus (mistake -> mistak after stemming)
#print ('mistak appears:', count_vect.vocabulary_.get(u'mistak') , 'in the corpus')

(23651, 20591)

In [15]:
## Get the TF-IDF vector representation of the data
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print ('Dimension of TF-IDF vector :' , X_train_tfidf.shape)

Dimension of TF-IDF vector : (23651, 20591)


# Apply Classification

In [16]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Prediction

In [17]:
## Prediction part

X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

In [20]:
df_train = pd.read_csv("check.csv")
x_tt = df_train.tweets

In [23]:
counter  = 0
for doc, category in zip(x_tt, predicted):
    print('%r => %s' % (doc, id_to_tag_name[category]))
    if(counter == 200):
        break
    counter += 1

'@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[' => football
'Layin n bed with a headache  ughhhh...waitin on your call...' => hollywood
'Funeral ceremony...gloomy friday...' => football
'wants to hang out with friends SOON!' => machine learning
'@dannycastillo We want to trade with someone who has Houston tickets, but no one will.' => cricket
"Re-pinging @ghostridah14: why didn't you go to prom? BC my bf didn't like my friends" => football
"I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous!" => machine learning
'Hmmm. http://www.djhero.com/ is down' => machine learning
'@charviray Charlene my love. I miss you' => hollywood
"@kelcouch I'm sorry  at least it's Friday?" => bollywood
'cant fall asleep' => football
'Choked on her retainers' => bollywood
'Ugh! I have to beat this stupid song to get to the next  rude!' => bollywood
'@BrodyJenner if u watch the hills

# Get Result

In [90]:
import os
import time
import sys


sys.stdout.write("%s" % (" " * toolbar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (toolbar_width+1))

dirName = 'results'
 
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

## predictions for first 10 test samples
text_hollywood = ''
text_bollywood = ''
text_ml = ''
text_football = ''
text_bigg_boss = ''
text_food = '' 
text_hp_day = ''
text_politics = ''
text_mobiles = ''
text_cricket = ''
counter  = 0
for doc, category in zip(X_test, predicted):
    if id_to_tag_name[category] == "hollywood":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_hollywood != ""):
            text_hollywood = text_hollywood + '\n' + doc
        else:
            text_hollywood = doc
        f.write(text_hollywood)
        f.close()
    if id_to_tag_name[category] == "bollywood":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_bollywood != ""):
            text_bollywood = text_bollywood + '\n' + doc
        else:
            text_bollywood = doc
        f.write(text_bollywood)
        f.close()
    if id_to_tag_name[category] == "machine learning":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_ml != ""):
            text_ml = text_ml + '\n' + doc
        else:
            text_ml = doc
        f.write(text_ml)
        f.close()
    if id_to_tag_name[category] == "football":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_football != ""):
            text_football = text_football + '\n' + doc
        else:
            text_football = doc
        f.write(text_football)
        f.close()
    if id_to_tag_name[category] == "bigg boss":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_bigg_boss != ""):
            text_bigg_boss = text_bigg_boss + '\n' + doc
        else:
            text_bigg_boss = doc
        f.write(text_bigg_boss)
        f.close()
    if id_to_tag_name[category] == "food":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_food != ""):
            text_food = text_food + '\n' + doc
        else:
            text_food = doc
        f.write(text_food)
        f.close()
    if id_to_tag_name[category] == "happy birthday":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_hp_day != ""):
            text_hp_day = text_hp_day + '\n' + doc
        else:
            text_hp_day = doc
        f.write(text_hp_day)
        f.close() 
    if id_to_tag_name[category] == "politics":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_politics != ""):
            text_politics = text_politics + '\n' + doc
        else:
            text_politics = doc
        f.write(text_politics)
        f.close() 
    if id_to_tag_name[category] == "mobiles":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_mobiles != ""):
            text_mobiles = text_mobiles + '\n' + doc
        else:
            text_mobiles = doc
        f.write(text_mobiles)
        f.close()
    if id_to_tag_name[category] == "cricket":
        f= open('results/'+id_to_tag_name[category]+".txt","w+")
        if (text_cricket != ""):
            text_cricket = text_cricket + '\n' + doc
        else:
            text_cricket = doc
        f.write(text_cricket)
        f.close()



                                       Directory  results  Created 


In [87]:
np.mean(predicted == y_test) ## 80% sounds good only


0.9459181045583311

In [79]:
import time
import sys

toolbar_width = 40

# setup toolbar
sys.stdout.write("[%s]" % ("" * toolbar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '['

for i in range(toolbar_width):
    time.sleep(0.1) # do real work here
    # update the bar
    sys.stdout.write("-")
    sys.stdout.flush()

sys.stdout.write(">Done]\n") # this ends the progress bar

[---------------------------------------->Done]
