### Importing the libraries

In [15]:
import numpy as np
import pandas as pd
import sklearn 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import re
import string
import re
from  sklearn.feature_extraction.text import TfidfVectorizer
# Seed NumPy's global random number generator so that NumPy-based randomness
# later in the notebook is repeatable when the notebook is run top to bottom.
np.random.seed(69420)


### Loading the dataset

In [16]:
# Shareable Google Drive links of the training and test CSV files
train_link = "https://drive.google.com/file/d/199uKHh1aPSl46XVX_gCVgXasovlOsZ3T/view?usp=sharing"
test_link = "https://drive.google.com/file/d/1LWR1cxvTM-BKkYyE0Gb0XrXRS59BPoco/view?usp=sharing"

# The file id is the second-to-last '/'-separated segment of each link
train_id, test_id = (link.split('/')[-2] for link in (train_link, test_link))

# URL prefix that the file id gets appended to
start_url = 'https://drive.google.com/uc?id='

# The dataframes: train first, then test, both decoded as latin-1
df_train, df_test = (
    pd.read_csv(f"{start_url}{file_id}", encoding="latin-1")
    for file_id in (train_id, test_id)
)

### Tokenizing and removing stopwords, punctuation and special symbols

In [17]:
## Getting a list of english stopwords
# NOTE: this needs the NLTK "stopwords" corpus, and word_tokenize below needs
# the NLTK tokenizer models; both must already be available locally
# (see nltk.download).
stopwords = [a.lower() for a in nltk.corpus.stopwords.words('english')]
# Set copy of the list for constant-time membership tests; `stopwords` itself
# is left as a list.
_stopword_set = frozenset(stopwords)

def process_sentence(str):
    """Clean one raw sentence for vectorisation.

    Steps, in order:
      1. drop e-mail-like tokens (anything matching ``\\S+@\\S+``),
      2. tokenize with NLTK's ``word_tokenize``,
      3. keep only purely alphabetic tokens, lower-cased,
      4. drop English stopwords.

    Parameters
    ----------
    str : the raw sentence text. (The parameter name shadows the builtin
        inside this function; it is kept so existing callers keep working.)

    Returns
    -------
    The surviving tokens joined by single spaces; an empty string when
    nothing survives.
    """
    # E-mail addresses have to be removed from the raw text *before*
    # tokenizing. Afterwards it is too late: once tokens are filtered with
    # isalpha() none of them can contain '@', so the substitution would never
    # match and the alphabetic pieces of an address would leak into the result.
    text = re.sub(r'\S+@\S+', ' ', str)

    # isalpha() discards punctuation, numbers and mixed tokens in one go
    tokens = (token.lower() for token in word_tokenize(text) if token.isalpha())

    # Removing the words that are present in the stopwords
    return ' '.join(token for token in tokens if token not in _stopword_set)

# Clean the raw `Sentence` column of both frames into `modified_sentences`
df_train['modified_sentences'] = df_train['Sentence'].apply(process_sentence)
df_test['modified_sentences'] = df_test['Sentence'].apply(process_sentence)

# Display the training frame together with the new column
df_train

Unnamed: 0,Sentence,Sentiment,modified_sentences
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi lows bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,last quarter componenta net sales doubled peri...
3,According to the Finnish-Russian Chamber of Co...,neutral,according chamber commerce major construction ...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining percent sta...
...,...,...,...
4860,3-star analyst Joe Wittine from Longbow Resear...,positive,analyst joe wittine longbow research reiterate...
4861,Our standardised services have met with a posi...,positive,standardised services met positive reception a...
4862,The Kyroskoski investment is to be completed i...,neutral,kyroskoski investment completed late investmen...
4863,The Group 's cash flow from operations will be...,positive,group cash flow operations positive


### Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused for every sentence
lemmatizer = WordNetLemmatizer()

def lemmatize_words(str):
    """Lemmatize every whitespace-separated word of `str`.

    Each word is passed to ``lemmatizer.lemmatize`` with its default
    arguments, and the results are joined back together with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, str.split()))
    
# Replace the cleaned sentences of both frames with their lemmatized form
df_train['modified_sentences'] = df_train['modified_sentences'].apply(lemmatize_words)
df_test['modified_sentences'] = df_test['modified_sentences'].apply(lemmatize_words)


### Vectorising the Words

In [19]:
# Build TF-IDF features from the cleaned, lemmatized training sentences
sentences = df_train.modified_sentences
vectorizer = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and IDF weights and vectorises the
# corpus in a single pass, instead of tokenizing every sentence twice with a
# separate fit() followed by transform() on the same data.
vector = vectorizer.fit_transform(sentences)
# Dense copy of the sparse TF-IDF matrix, kept so that X stays a NumPy array
X = vector.toarray()
# Target labels
y = df_train.Sentiment

### Training the model and predicting the sentiments

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Hold out 30% of the labelled data for evaluation. The explicit random_state
# (same value as the np.random.seed call at the top of the notebook) makes the
# split identical every time this cell is run, instead of depending on how
# much of NumPy's global random state has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=69420
)

# Fit a multinomial Naive Bayes classifier on the training part and predict
# the held-out part
model = MultinomialNB().fit(X_train, y_train)
predictions= model.predict(X_test)

## Calculating the f1_score
# average=None yields one score per class rather than a single aggregate
print(f1_score(y_test,predictions,average=None))

[0.06167401 0.78129713 0.52840909]


### Writing the obtained predictions to a CSV file

In [21]:
def predict(str):
    """Predict sentiment labels for already pre-processed text.

    `str` is handed unchanged to the fitted ``vectorizer.transform``; the
    resulting features are classified by the trained ``model`` and its
    predictions are returned.
    """
    features = vectorizer.transform(str)
    return model.predict(features)

# Predict the sentiment of every pre-processed test sentence
predictions_actual_test = predict(df_test['modified_sentences'])

# Collect the predictions in a single-column frame and write it to disk
df_pred = pd.DataFrame({"predictions": predictions_actual_test})
df_pred.to_csv('Predictions_sentiments.csv')