# TFIDF + multinomial bayes

In [63]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# import CountVectorizer, nltk
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import PorterStemmer

In [64]:
# loading all files. 

data = pd.read_csv("all-data.csv", encoding='unicode_escape',names=['Sentiment', 'Text'])
data['Sentiment']

# Filter out rows where Sentiment is 'neutral'
data = data[data['Sentiment'] != 'negative']
# Split data into training and test sets
sentence_train, sentence_test, y_train, y_test =  train_test_split(data["Text"], data["Sentiment"], test_size = 0.20, shuffle = False)
print(sentence_train)
print("-------------")
print(y_train)


0       According to Gran , the company has no plans t...
1       Technopolis plans to develop in stages an area...
3       With the new production plant the company woul...
4       According to the company 's updated strategy f...
5       FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
                              ...                        
3489    `` We have come out with a decision which is b...
3490    1 April 2011 - Finnish machinery rental compan...
3492    7 March 2011 - Finnish IT company Digia Oyj HE...
3493    8,600 m , and at the time of investment it is ...
3494    A filter is used to pre-process packets to det...
Name: Text, Length: 3393, dtype: object
-------------
0        neutral
1        neutral
3       positive
4       positive
5       positive
          ...   
3489     neutral
3490     neutral
3492    positive
3493     neutral
3494     neutral
Name: Sentiment, Length: 3393, dtype: object


In [65]:
# Import NLTK for stop words
import nltk
from nltk.corpus import stopwords

In [66]:
# Download NLTK stop words (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
print(stop_words);

{"you'd", 'aren', 'we', 'there', 'y', 'on', 'o', 'other', 'wasn', 'from', 'he', 'through', 'with', 'm', 'during', 'not', 'its', 'than', 't', 'by', 'ours', 'mustn', 'were', 'yourselves', 'of', 'doesn', 'hasn', 'same', 'and', 'any', 'then', 'they', 'the', "mightn't", 'very', 'me', 'who', 'this', 'into', 'when', 'all', "wouldn't", 'no', 'shan', 'between', 'will', 'weren', 'most', "you'll", 'that', 'yours', 'been', 'himself', 'a', "weren't", 'ain', 'she', 'these', "haven't", 'more', 'isn', 'needn', "isn't", "needn't", "that'll", 'here', 'in', 're', 'be', 'why', "couldn't", 'how', 'has', 'do', 'him', 'being', 'can', 'did', 'whom', 'just', "aren't", 'are', "it's", "didn't", 'your', 'his', "hadn't", 'about', 'nor', "you've", 'while', 'haven', "shan't", 'my', 'is', 'as', 'only', 'theirs', 's', 'for', 'don', 'before', 'each', 'it', 'up', "mustn't", 'am', 'herself', 'll', 'because', 'at', 'to', 'under', 'now', 'until', 'our', 'down', 'what', 'once', 'should', 'you', 'those', 'both', 'own', "you'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rebecka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rebecka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
# Split data into training and test sets
sentence_train, sentence_test, y_train, y_test =  train_test_split(data["Text"], data["Sentiment"], test_size=0.20, shuffle=False)

In [68]:
# Function to remove stop words
def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply stop words removal to the text data
sentence_train = sentence_train.apply(remove_stop_words)
sentence_test = sentence_test.apply(remove_stop_words)
print(sentence_train);
print(sentence_test);
# Define the pipeline
#text_clf = Pipeline([
 #   ('vect', CountVectorizer(min_df=2)),
  #  ('tfidf', TfidfTransformer()),
   # ('clf', MultinomialNB())
#])

0       According Gran , company plans move production...
1       Technopolis plans develop stages area less 100...
3       new production plant company would increase ca...
4       According company 's updated strategy years 20...
5       FINANCING ASPOCOMP 'S GROWTH Aspocomp aggressi...
                              ...                        
3489    `` come decision based preliminary economic , ...
3490    1 April 2011 - Finnish machinery rental compan...
3492    7 March 2011 - Finnish company Digia Oyj HEL :...
3493    8,600 , time investment fully leased several t...
3494    filter used pre-process packets determine need...
Name: Text, Length: 3393, dtype: object
3495                                profit still target .
3496    spokeswoman Italian fashion house declined com...
3497    total EUR73 .7 provided secured senior three-y...
3499    total 15,000 new Citycon shares nominal value ...
3500    total six polled analysts rated M-real -- two ...
                              ..

In [69]:
# Create a PorterStemmer instance
porter_stemmer = PorterStemmer()

# Function to apply stemming and remove stop words
def preprocess_text(text):
    words = nltk.word_tokenize(text)
    stemmed_words = [porter_stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return ' '.join(stemmed_words)

# Apply stemming and stop words removal to the text data
sentence_train = sentence_train.apply(preprocess_text)
sentence_test = sentence_test.apply(preprocess_text)

In [70]:
# initialize CountVectorizer
#movieVzer= CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=3000) # use top 3000 words only. 78.25% acc.
countVectorizer = CountVectorizer(min_df=2)         # use all 25K words. Higher accuracy

# fit and tranform using training text 
sentence_train_counts = countVectorizer.fit_transform(sentence_train)

# Convert raw frequency counts into TF-IDF values
tfidfTransformer = TfidfTransformer()
sentence_train_tfidf = tfidfTransformer.fit_transform(sentence_train_counts)

# Using the fitted vectorizer and transformer, tranform the test data
sentence_test_counts = countVectorizer.transform(sentence_test)
sentence_test_tfidf = tfidfTransformer.transform(sentence_test_counts)

In [71]:
# Now ready to build a classifier. 
# We will use Multinominal Naive Bayes as our model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Train a Multimoda Naive Bayes classifier. Again, we call it "fitting"
clf = MultinomialNB()
clf.fit(sentence_train_tfidf, y_train)

clfSVM = SVC(kernel='linear', C=1.0)
clfSVM.fit(sentence_train, y_train)



In [72]:
predicted = clf.predict(sentence_test_tfidf)
print(np.mean(predicted == y_test))

predicted = clfSVM.predict(sentence_test_tfidf)
print(np.mean(predicted == y_test))

0.8091872791519434
