In [1]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import random
from PIL import Image
import matplotlib.pyplot as plt
from autocorrect import spell

In [2]:
%matplotlib inline
pd.options.display.max_colwidth = -1 #does not truncate text

## Data Preprocessing

In [3]:
data_dir = 'C:/Users/Brajesh Ranjan/Desktop/Data Science/Datasets/Text Mining/'
file_name = 'demo.csv'

In [4]:
input_df = pd.read_csv(data_dir + file_name,encoding = 'latin1')

In [5]:
tweets_df = input_df[['text']]
tweets_df.columns = ['tweets']

In [6]:
tweets_df.head()

Unnamed: 0,tweets
0,RT @rssurjewala: Critical question: Was PayTM informed about #Demonetization edict by PM? It's clearly fishy and requires full disclosure &amp;
1,RT @Hemant_80: Did you vote on #Demonetization on Modi survey app?
2,"RT @roshankar: Former FinSec, RBI Dy Governor, CBDT Chair + Harvard Professor lambaste #Demonetization.\r\n\r\nIf not for Aam Aadmi, listen to th"
3,RT @ANI_news: Gurugram (Haryana): Post office employees provide cash exchange to patients in hospitals #demonetization https://t.co/uGMxUP9
4,RT @satishacharya: Reddy Wedding! @mail_today cartoon #demonetization #ReddyWedding https://t.co/u7gLNrq31F


## Remove URLs, RTs and Twitter Handles

In [7]:
def remove_urls(input_text):
    tokens = input_text.split()
    clean_tokens = [word for word in tokens
                   if 'http' not in word and
                   '@' not in word and 
                   '<' not in word and 
                   'RT' not in word]
    return ' '.join(clean_tokens)

string1 = "I Love Python"
string_lst = string1.split()
'_'.join string_lst

In [8]:
tweets_df['tweets'] = tweets_df['tweets'].apply(remove_urls)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
tweets_df.head()

Unnamed: 0,tweets
0,Critical question: Was PayTM informed about #Demonetization edict by PM? It's clearly fishy and requires full disclosure &amp;
1,Did you vote on #Demonetization on Modi survey app?
2,"Former FinSec, RBI Dy Governor, CBDT Chair + Harvard Professor lambaste #Demonetization. If not for Aam Aadmi, listen to th"
3,Gurugram (Haryana): Post office employees provide cash exchange to patients in hospitals #demonetization
4,Reddy Wedding! cartoon #demonetization #ReddyWedding


## Convert Case

In [10]:
def convert_case(input_text):
    split = input_text.split(' ')
    lower = [text.lower() for text in split] 
    output = ' '.join(lower)
    return output

In [11]:
tweets_df['tweets'] = tweets_df['tweets'].apply(convert_case)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
tweets_df.head()

Unnamed: 0,tweets
0,critical question: was paytm informed about #demonetization edict by pm? it's clearly fishy and requires full disclosure &amp;
1,did you vote on #demonetization on modi survey app?
2,"former finsec, rbi dy governor, cbdt chair + harvard professor lambaste #demonetization. if not for aam aadmi, listen to th"
3,gurugram (haryana): post office employees provide cash exchange to patients in hospitals #demonetization
4,reddy wedding! cartoon #demonetization #reddywedding


## Clear Punctuation

In [13]:
tweets_df['tweets'] = tweets_df['tweets'].str.replace('[^\w\s]','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
tweets_df.head()

Unnamed: 0,tweets
0,critical question was paytm informed about demonetization edict by pm its clearly fishy and requires full disclosure amp
1,did you vote on demonetization on modi survey app
2,former finsec rbi dy governor cbdt chair harvard professor lambaste demonetization if not for aam aadmi listen to th
3,gurugram haryana post office employees provide cash exchange to patients in hospitals demonetization
4,reddy wedding cartoon demonetization reddywedding


## Remove Stopword

In [15]:
def remove_stop_words(input_text):
    stopword_list = set(stopwords.words('english'))
    stopword_list.add("https")
    stopword_list.add("00A0")
    stopword_list.add("00BD")
    stopword_list.add("00B8")
    stopword_list.add("ed")
    stopword_list.add("demonetization")
    stopword_list.add("Demonetization co")
    stopword_list.add("narendramodi")
    stopword_list.add("modi")
    stopword_list.add("lakh")
    stopword_list.add("amp")
    stopword_list.add("jampk")
    stopword_list.add("narendra")
    stopword_list.add("thats")
    stopword_list.add("ads")
    stopword_list.add("8086")
    stopword_list.add("youtube")
    stopword_list.add("dear")
    tokens = input_text.split()
    stop_words_removed = [token for token in tokens if token not in stopword_list]
    return ' '.join(stop_words_removed)

In [16]:
#import nltk
#nltk.download('stopwords')

In [17]:
tweets_df['tweets'] = tweets_df['tweets'].apply(remove_stop_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
tweets_df.head()

Unnamed: 0,tweets
0,critical question paytm informed edict pm clearly fishy requires full disclosure
1,vote survey app
2,former finsec rbi dy governor cbdt chair harvard professor lambaste aam aadmi listen th
3,gurugram haryana post office employees provide cash exchange patients hospitals
4,reddy wedding cartoon reddywedding


## Keep Alpha Numerics

In [None]:
#tweets_df['tweets'].apply (lambda x : re.sub('[A-Za-z0-9]',' ,x')).head(10)

In [19]:
tweets_df['tweets'] = tweets_df['tweets'].apply(lambda x : re.sub('[^A-Za-z0-9]',' ', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
tweets_df.head()

Unnamed: 0,tweets
0,critical question paytm informed edict pm clearly fishy requires full disclosure
1,vote survey app
2,former finsec rbi dy governor cbdt chair harvard professor lambaste aam aadmi listen th
3,gurugram haryana post office employees provide cash exchange patients hospitals
4,reddy wedding cartoon reddywedding


## Remove Words of smaller length

In [21]:
def remove_small_words(input_text):
    tokens = input_text.split()
    cleaned_tokens = [token for token in tokens if len(token) >= 3]
    return ' '.join(cleaned_tokens)

In [22]:
tweets_df['tweets'] = tweets_df['tweets'].apply(remove_small_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
tweets_df.head()

Unnamed: 0,tweets
0,critical question paytm informed edict clearly fishy requires full disclosure
1,vote survey app
2,former finsec rbi governor cbdt chair harvard professor lambaste aam aadmi listen
3,gurugram haryana post office employees provide cash exchange patients hospitals
4,reddy wedding cartoon reddywedding


## Lammatization(convert the word to its root word)

In [24]:
import nltk

In [25]:
lemmatizer = WordNetLemmatizer()

def lemmatize(input_text):
    tokens = input_text.split()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)

tweets_df['lemmas'] = tweets_df['tweets'].apply(lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
tweets_df[['lemmas']].head(10)

Unnamed: 0,lemmas
0,critical question paytm informed edict clearly fishy requires full disclosure
1,vote survey app
2,former finsec rbi governor cbdt chair harvard professor lambaste aam aadmi listen
3,gurugram haryana post office employee provide cash exchange patient hospital
4,reddy wedding cartoon reddywedding
5,india blackmoney symptom disease via
6,looted bank kishtwar third incident since terrorist
7,calling nationalist join walk corruptionfreeindia spread benefit among ma
8,many opposition leader respect decisionbut support oppositio
9,national reform destroyed even essence sagan instance urge giving second though


## Term Frequency

In [None]:
tf = tweets_df['tweets'].apply(lambda x: pd.value_counts(x.split(" ")))\
.sum(axis = 0)\
.reset_index()

In [None]:
tf.columns = ['words','tf']

In [None]:
tf.head()

In [None]:
for i,word in enumerate(tf['words']):
    tf.loc[i,'idf'] = np.log(tweets_df.shape[0]/(len(tweets_df[tweets_df['tweets'].str.contains(word)])))

In [None]:
tf['tf_idf'] = tf['tf']*tf['idf'] #TF = TF*IDF.

In [None]:
top_words = tf.sort_values('tf_idf',ascending = False)[0:200]

In [None]:
top_words.head()

In [None]:
#Creating Wordmap
words = list(top_words['words'])
score = list(top_words['tf_idf'])
top_tuples = list(zip(words,score))

d = {} #creating empty dictionary

for word,score in top_tuples:
    d[word] = score

wc = WordCloud(background_color = 'white',max_font_size = 200,height = 2000,width=2000,colormap='Spectral')
wc.fit_words(d)

In [None]:
%matplotlib notebook
plt.imshow(wc,interpolation='bilinear')
plt.axis("off")
wc.to_file("first_review.png")

In [None]:
from textblob import TextBlob

In [None]:
import nltk
nltk.download('brown')

In [None]:
import nltk
nltk.download('punkt')

In [None]:
def get_noun_phrases(input_text):
    blob = TextBlob(input_text)
    np = list(blob.noun_phrases)
    return np

tweets_df['noun_phrases'] = tweets_df['lemmas'].apply(get_noun_phrases)

In [None]:
tweets_df['len'] = tweets_df['noun_phrases'].apply(lambda x : len(x))

In [None]:
tweets_clean = tweets_df[tweets_df['len'] != 0]

In [None]:
tweets_clean.head()

In [None]:
tweets_clean['np'] = tweets_clean['noun_phrases'].apply(lambda x : ' '.join(x))

In [None]:
import nltk
nltk.download ("all")

In [None]:
import nltk
nltk.download ("all")

## 1 Dec Class Starts

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
vectorizer = TfidfVectorizer(ngram_range = (2,2)) #Unigram creation

In [39]:
tdm = vectorizer.fit_transform(tweets_df['lemmas']) #term document matrix

In [40]:
tdm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
freqs = [(word, tdm.getcol(idx).sum()) for word, idx
        in vectorizer.vocabulary_.items()]
d = {}
for word, score in freqs:
    d[word] = score

In [42]:
d

{'critical question': 95.44887187934621,
 'question paytm': 95.44887187934621,
 'paytm informed': 95.44887187934621,
 'informed edict': 95.44887187934621,
 'edict clearly': 95.2660700453711,
 'clearly fishy': 94.53775943078799,
 'fishy requires': 94.07898430624532,
 'requires full': 94.07898430624532,
 'full disclosure': 93.34074863834168,
 'vote survey': 46.06766520276945,
 'survey app': 48.18417512959298,
 'former finsec': 3.656783952710927,
 'finsec rbi': 3.656783952710927,
 'rbi governor': 5.691525602014475,
 'governor cbdt': 3.656783952710927,
 'cbdt chair': 3.656783952710927,
 'chair harvard': 3.656783952710927,
 'harvard professor': 3.656783952710927,
 'professor lambaste': 3.656783952710927,
 'lambaste aam': 3.656783952710927,
 'aam aadmi': 6.17246731981562,
 'aadmi listen': 3.656783952710927,
 'gurugram haryana': 0.6711232415153479,
 'haryana post': 0.6711232415153479,
 'post office': 0.8843440604024709,
 'office employee': 0.6711232415153479,
 'employee provide': 0.6711232415

In [36]:
w = WordCloud(background_color = 'White',
             max_words=200,
             max_font_size=200,height=2000,
             width=2000,colormap='Spectral').fit_words(d)

In [37]:
w.to_file('tfdsklearn_wc.png')

<wordcloud.wordcloud.WordCloud at 0x114fb99fb38>