In [48]:
# Import Libraries
import pandas as pd
import spacy
import nltk
import numpy as np

In [49]:
# Sentence Tokenizer
text="Today is a great day. It is even better than yesterday. And yesterday was the best day ever."
import nltk
from nltk.tokenize import sent_tokenize
sent_tokenize(text)

['Today is a great day.',
 'It is even better than yesterday.',
 'And yesterday was the best day ever.']

In [2]:
# Word Tokenizer
nltk.word_tokenize(text)

['Today',
 'is',
 'a',
 'great',
 'day',
 '.',
 'It',
 'is',
 'even',
 'better',
 'than',
 'yesterday',
 '.',
 'And',
 'yesterday',
 'was',
 'the',
 'best',
 'day',
 'ever',
 '.']

In [6]:
# Synonyms
from nltk.corpus import wordnet
syn=wordnet.synsets('dog')
syn


[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [14]:
# Antonyms
from nltk.corpus import wordnet
antonyms=[]
for syn in wordnet.synsets('depressed'):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
antonyms


['elate']


In [11]:
# Stemming
from stemming.porter2 import stem
stem("casually")

'casual'

In [19]:
# Lemmatization
import en_core_web_sm
nlp = en_core_web_sm.load()
doc="good better best"
for token in nlp(doc):
    print(token,token.lemma_)

good good
better better
best good


In [7]:
# Word Embeddings

In [1]:
# Part-Of-Speech
import en_core_web_sm
nlp = en_core_web_sm.load()
sentence="Ashok killed the snake with a stick"
for token in nlp(sentence):
   print(token,token.pos_) 

Ashok PROPN
killed VERB
the DET
snake NOUN
with ADP
a DET
stick NOUN


In [4]:
# Named Entity Recognition
import spacy
nlp = en_core_web_sm.load()
sentence="Ram of Apple Inc. travelled to Sydney on 5th October 2017"
for token in nlp(sentence):
   print(token, token.ent_type_) 

Ram 
of 
Apple ORG
Inc. ORG
travelled 
to 
Sydney GPE
on 
5th ORDINAL
October DATE
2017 DATE


In [8]:
# Sentiment Analysis

In [6]:
# Text Summarisation
from gensim.summarization import summarize

sentence="Automatic summarization is the process of shortening a text document with software, in order to create a summary with the major points of the original document. Technologies that can make a coherent summary take into account variables such as length, writing style and syntax.Automatic data summarization is part of machine learning and data mining.."

print(summarize(sentence))




In [9]:
# Semantic Text Similarity

In [10]:
# Language Identification

In [13]:
# Read the dataset
train = pd.read_csv('train_E6oV3lV.csv')

In [14]:
train.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [15]:
# Count the number of words
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


In [16]:
# Count the number of characters
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


In [17]:
# Average Word Lenth
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


In [28]:
# Number of Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
print("Stopwords:",stop)

train['stopwords'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['tweet','stopwords']].head()

Stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [19]:
# Number of special characters
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [20]:
# Number of numerics
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [29]:
# Number of uppercase words
train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [34]:
# Number of lower words
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

In [35]:
# Removing punctuation
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

In [36]:
# Removal of Stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [37]:
# Common words removal
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
print("freq: ", freq)

freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()


freq:  user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64


0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [39]:
# Rare words removal
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
print("freq: ", freq)

freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

freq:  writerâ           1
enforcement       1
allofthis         1
motioncapture     1
thingâï           1
enjoythedinner    1
1430              1
pita              1
springer          1
brillenarmy       1
dtype: int64


0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [40]:
# Spelling Correction
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [41]:
# Tokenization
TextBlob(train['tweet'][1]).words

WordList(['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

In [42]:
# Stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [43]:
# Lemmatization
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [44]:
# N-grams
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [45]:
# Term Frequency
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,lyft,1
1,cant,1
2,thanks,1
3,getthanked,1
4,disapointed,1
5,cause,1
6,offer,1
7,credit,1
8,van,1
9,wheelchair,1


In [50]:
# Inverse Document Frequency
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,lyft,1,8.762865
1,cant,1,3.538194
2,thanks,1,4.597751
3,getthanked,1,9.679156
4,disapointed,1,10.372303
5,cause,1,5.690172
6,offer,1,6.522155
7,credit,1,7.327781
8,van,1,5.236505
9,wheelchair,1,9.273691


In [52]:
# TF - IDF
tf1['tfidf'] = tf1['tf'] * tf1['idf']
print("tf1: ",tf1)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['tweet'])

train_vect

tf1:            words  tf        idf      tfidf
0          lyft   1   8.762865   8.762865
1          cant   1   3.538194   3.538194
2        thanks   1   4.597751   4.597751
3    getthanked   1   9.679156   9.679156
4   disapointed   1  10.372303  10.372303
5         cause   1   5.690172   5.690172
6         offer   1   6.522155   6.522155
7        credit   1   7.327781   7.327781
8           van   1   5.236505   5.236505
9    wheelchair   1   9.273691   9.273691
10          use   1   3.552287   3.552287
11         dont   1   3.745585   3.745585
12          pdx   1   8.762865   8.762865


<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 114043 stored elements in Compressed Sparse Row format>

In [53]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['tweet'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 128380 stored elements in Compressed Sparse Row format>

In [55]:
# Sentiment Analysis
print(train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment))

train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['tweet','sentiment']].head()

0    (-0.3, 0.5354166666666667)
1                    (0.2, 0.2)
2                    (0.0, 0.0)
3                    (0.0, 0.0)
4                    (0.0, 0.0)
Name: tweet, dtype: object


Unnamed: 0,tweet,sentiment
0,father dysfunctional selfish drag kid dysfunct...,-0.3
1,thanks lyft credit cant use cause dont offer w...,0.2
2,bihday majesty,0.0
3,model take urð ðððð ððð,0.0
4,factsguide society motivation,0.0


In [None]:
# Word Embeddings

# Step 1 : Download pre-trained word vectors and then convert them into word2vec format
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
print(glove2word2vec(glove_input_file, word2vec_output_file))

# Step 2 : Load word2vsc file as a model
from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

print("model[go] :", model['go'])
print("model[away] :", model['away'])
print("(model['go'] + model['away'])/2 :",(model['go'] + model['away'])/2)

(400000, 100)
