# preparing libraries

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
stop_words = set(stopwords.words('english'))

#Dummy text

In [7]:
txt = "Welcome to the domain NLP.let us do the implementation  in python. nlp is simple, powerful and fantastic"

# Tokenization

In [8]:
# sent_tokenize is one of instances of 
# PunktSentenceTokenizer from the nltk.tokenize.punkt module
  
tokenized = sent_tokenize(txt)
print(tokenized)
for i in tokenized:
      
    # Word tokenizers is used to find the words 
    # and punctuation in a string
    wordsList = nltk.word_tokenize(i)
  
    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop_words] 
  
    #  Using a Tagger. Which is part-of-speech 
    # tagger or POS-tagger. 
    tagged = nltk.pos_tag(wordsList)
  
    print(tagged)

['Welcome to the domain NLP.let us do the implementation  in python.', 'nlp is simple, powerful and fantastic']
[('Welcome', 'NNP'), ('domain', 'NN'), ('NLP.let', 'NNP'), ('us', 'PRP'), ('implementation', 'NN'), ('python', 'NN'), ('.', '.')]
[('nlp', 'RB'), ('simple', 'JJ'), (',', ','), ('powerful', 'JJ'), ('fantastic', 'JJ')]


# Ngram model

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use(style='seaborn')

#get the data from https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news/version/5

colnames=['Sentiment', 'news']

df=pd.read_csv('all-data.csv',encoding = "ISO-8859-1", names=colnames, header = None)
df.head()


Unnamed: 0,Sentiment,news
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [11]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  4846 non-null   object
 1   news       4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [13]:
df['Sentiment'].value_counts()
y=df['Sentiment'].values
y.shape


(4846,)

In [14]:
x=df['news'].values
x.shape

(4846,)

In [15]:
from sklearn.model_selection import train_test_split
(x_train,x_test,y_train,y_test)=train_test_split(x,y,test_size=0.4)
x_train.shape
y_train.shape
x_test.shape
y_test.shape

(1939,)

In [16]:
df1=pd.DataFrame(x_train)
df1=df1.rename(columns={0:'news'})
df2=pd.DataFrame(y_train)
df2=df2.rename(columns={0:'sentiment'})
df_train=pd.concat([df1,df2],axis=1)
df_train.head()

Unnamed: 0,news,sentiment
0,"Look out for vintage fabric cushion covers , '...",neutral
1,The cooperation will involve Arena Partners bu...,neutral
2,The first phase of the logistics complex envis...,neutral
3,"The contractor of the shopping center , China ...",neutral
4,"Published by Globes online , Israel business n...",neutral


In [17]:
df3=pd.DataFrame(x_test)
df3=df3.rename(columns={0:'news'})
df4=pd.DataFrame(y_test)
df4=df2.rename(columns={0:'sentiment'})
df_test=pd.concat([df3,df4],axis=1)
df_test.head()

Unnamed: 0,news,sentiment
0,"Net interest income was EUR 39.3 mn , up from ...",neutral
1,Its board of directors will propose a dividend...,neutral
2,The company 's net sales in 2009 totalled MEUR...,neutral
3,Laavainen said Benecol was well known in Europ...,neutral
4,"( ADP News ) - Nov 5 , 2008 - Finnish electron...",neutral


#Removing punctuations

In [18]:
#library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
#defining the function to remove punctuation
def remove_punctuation(text):
  if(type(text)==float):
    return text
  ans=""  
  for i in text:     
    if i not in string.punctuation:
      ans+=i    
  return ans

# Storing the puntuation free text in a new column called clean_msg

In [20]:
df_train['news']= df_train['news'].apply(lambda x:remove_punctuation(x))
df_test['news']= df_test['news'].apply(lambda x:remove_punctuation(x))
df_train.head()
#punctuations are removed from news column in train dataset

Unnamed: 0,news,sentiment
0,Look out for vintage fabric cushion covers 70...,neutral
1,The cooperation will involve Arena Partners bu...,neutral
2,The first phase of the logistics complex envis...,neutral
3,The contractor of the shopping center China S...,neutral
4,Published by Globes online Israel business ne...,neutral


In [21]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Method to generate n-grams:

In [24]:
#params:
#text-the text for which we have to generate n-grams
#ngram-number of grams to be generated from the text(1,2,3,4 etc., default value=1)
def generate_N_grams(text,ngram=1):
  words=[word for word in text.split(" ") if word not in set(stopwords.words('english'))]  
  print("Sentence after removing stopwords:",words)
  temp=zip(*[words[i:] for i in range(0,ngram)])
  ans=[' '.join(ngram) for ngram in temp]
  return ans

In [26]:
generate_N_grams("The sun rises in the east",2)

Sentence after removing stopwords: ['The', 'sun', 'rises', 'east']


['The sun', 'sun rises', 'rises east']

In [27]:
generate_N_grams("The sun rises in the east",3)

Sentence after removing stopwords: ['The', 'sun', 'rises', 'east']


['The sun rises', 'sun rises east']

In [28]:
generate_N_grams("The sun rises in the east",4)

Sentence after removing stopwords: ['The', 'sun', 'rises', 'east']


['The sun rises east']