In [31]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle
import pandas as pd

In [32]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (used by word_tokenize) and the 'stopwords' corpus (used by
# stopwords.words). Packages that are already installed and current are
# reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
# Lazily-built cache of NLTK's English stop words; see _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Built on first use (not at definition time) so the corpus only has
        # to be available when a tweet is actually cleaned.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_tweet(tweet):
    """Normalise a raw tweet into a space-separated string of content words.

    The text is lowercased, URLs and @mentions are stripped, every run of
    non-word characters is collapsed to a single space, the result is
    tokenized with NLTK and English stop words are dropped.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The remaining tokens joined by single spaces; an empty string
        when nothing is left after cleaning.
    """
    # Convert to lowercase
    text = tweet.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Replace punctuation/symbols with a space. Note that \W only matches
    # characters that are not letters, digits or underscores, so digits and
    # underscores are kept.
    text = re.sub(r'\W+', ' ', text)

    # Tokenize, then drop stop words. The stop-word set is cached because this
    # function is applied to every row of the dataset.
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # Join the tokens back into a sentence
    return ' '.join(kept_tokens)

In [34]:
# Column labels supplied explicitly through `names=`, so the first line of
# the CSV is read as data rather than as a header row.
# ('empresa' / 'sentimento' are Portuguese for company / sentiment.)
column_names = [
    'id',
    'empresa',
    'sentimento',
    'tweet',
]
dataset = pd.read_csv(
    'twitter_training.csv',
    names=column_names,
    sep=',',
)

In [35]:
# Clean the frame in place: first discard every row that has a missing value
# in any column, then keep only the first occurrence of fully identical rows.
dataset.dropna(axis=0, how='any', inplace=True)
dataset.drop_duplicates(keep='first', inplace=True)

In [36]:
# Run every raw tweet through clean_tweet and store the result in a new
# 'Clean' column next to the original text.
dataset['Clean'] = dataset['tweet'].map(clean_tweet)

In [37]:
# Features are the cleaned tweets, targets are the sentiment labels.
X, y = dataset['Clean'], dataset['sentimento']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Bag-of-words features: CountVectorizer converts the preprocessed text into
# a matrix of word frequencies.
vectorizer = CountVectorizer()
# The vocabulary is learned from the training split only ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and then reused as-is to encode the test split.
X_test_vec = vectorizer.transform(X_test)

In [39]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_vec, y_train)

In [40]:
# Persist the trained classifier to disk so it can be reloaded later
# without retraining.
with open('my_model.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [41]:
# Persist the fitted vectorizer too, so new text can later be encoded with
# the same vocabulary the classifier was trained on.
with open('my_vectorizer.pickle', mode='wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [42]:
# Evaluating the model
# score() is computed on the held-out test split, which the model never saw
# during fitting.
accuracy = model.score(X_test_vec, y_test)
# Report the score as a percentage with two decimals.
print(f'Accuracy of the trained model: {accuracy * 100:.2f}%')

Accuracy of the trained model: 73.77%


In [43]:
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# Interactive prediction: reload the persisted classifier and vectorizer,
# read one line of text from the user and print the predicted sentiment.
# Requires clean_tweet (defined earlier in this notebook) to have been run.
try:
    # SECURITY: pickle.load can execute arbitrary code. Only load pickle files
    # that were written by this notebook, never files from untrusted sources.
    # The `with` blocks make sure both file handles are closed.
    with open('my_model.pickle', 'rb') as model_file:
        model_object = pickle.load(model_file)
    with open('my_vectorizer.pickle', 'rb') as vectorizer_file:
        vectorizer_object = pickle.load(vectorizer_file)

    input_text = input("Enter something: ")
    input_text = input_text.lower()
    print(input_text)

    # Use the same preprocessing that produced the training data, so the
    # classifier sees the kind of text it was fitted on.
    modified_text = clean_tweet(input_text)
    print(modified_text)

    if not modified_text:
        # Nothing left after cleaning (empty input or only stop words/symbols).
        print("Sentiment : unavailable (no usable words in the input)")
    else:
        # transform() expects an iterable of documents. Wrapping the string in
        # a list classifies the input as ONE document instead of classifying
        # each word separately.
        modified_text = vectorizer_object.transform([modified_text])
        sentiment = list(model_object.predict(modified_text))
        print("Sentiment :", sentiment[0])
except FileNotFoundError as exc:
    # The pickle files are created by the training cells; report which one
    # is missing instead of failing silently.
    print(f"Could not load a saved artifact: {exc}")
except Exception as exc:
    # Keep the cell best-effort, but surface the problem to the user.
    print(f"Sentiment prediction failed: {exc!r}")