In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('https://www.kaggle.com/datasets/kazanova/sentiment140', encoding='latin-1', header=None)

# Rename the columns
data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Remove unnecessary columns
data.drop(['id', 'date', 'flag', 'user'], axis=1, inplace=True)

# Preview the data
print(data.head())


   target                                               text
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       0  is upset that he can't update his Facebook by ...
2       0  @Kenichan I dived many times for the ball. Man...
3       0    my whole body feels itchy and like its on fire 
4       0  @nationwideclass no, it's not behaving at all....


In [3]:
import nltk
nltk.download('corpora')
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Error loading corpora: Package 'corpora' not found in
[nltk_data]     index
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/espersonnel/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/espersonnel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize
    tokens = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the tokens
    text = ' '.join(tokens)
    
    return text

# Apply the clean_text function to the 'text' column of the dataframe
data['text'] = data['text'].apply(clean_text)

# Preview the preprocessed data
print(data.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/espersonnel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/espersonnel/nltk_data...


   target                                               text
0       0  switchfoot http twitpic com zl awww bummer sho...
1       0  upset update facebook texting might cry result...
2       0  kenichan dived many time ball managed save res...
3       0                    whole body feel itchy like fire
4       0                   nationwideclass behaving mad see


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the preprocessed text
vectorizer.fit(data['text'])

# Transform the preprocessed text into a bag of words
X = vectorizer.transform(data['text'])

# Target variable
y = data['target']

# Print the shape of the feature matrix and the target variable
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1600000, 573643)
Shape of y: (1600000,)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Sample Prediction for the sentiment of a tweet
tweet = "I love this movie!"
tweet = clean_text(tweet)
tweet = vectorizer.transform([tweet])
sentiment = model.predict(tweet)[0]

# Print the sentiment
if sentiment == 0:
    print("Negative")
else:
    print("Positive")


Positive


In [7]:
# Tweet example to test the model
tweet = "Why would you do that? You are so stupid!"
tweet = clean_text(tweet)
tweet = vectorizer.transform([tweet])
sentiment = model.predict(tweet)[0]

# Print the sentiment
if sentiment == 0:
    print("Negative")
else:
    print("Positive")

Negative


In [8]:
import joblib

# Save the model to a file
joblib.dump(model, 'model.joblib')

# Save the vectorizer to a file
joblib.dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']