In [14]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
# Import necessary libraries
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
import string

In [16]:
# Instantiate the small English spaCy pipeline once; the preprocessing
# function below reuses this module-level object for every tweet.
nlp = spacy.load(name='en_core_web_sm')

In [17]:
# Read the tweets CSV into a DataFrame.
# NOTE: the path below is absolute; adjust it if the file lives elsewhere.
file_path = '/content/tweets.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [18]:
# Rename 'label' to 'sentiment'. DataFrame.rename ignores missing keys by
# default, so this is a no-op when the column is already called 'sentiment'.
# The result is reassigned instead of using inplace=True, so the frame
# returned by read_csv is not mutated behind the reader's back.
df = df.rename(columns={'label': 'sentiment'})

In [19]:
# Print the column labels to confirm the schema the later cells rely on
# ('tweet' for the text, 'sentiment' for the target).
dataset_columns = df.columns
print("Columns in the dataset:", dataset_columns)

Columns in the dataset: Index(['id', 'sentiment', 'tweet'], dtype='object')


In [20]:
# Preprocessing function using spaCy
def preprocess_text(text):
    """Clean one tweet and return it as a lowercase, stopword-free string.

    Steps: strip URLs, replace every run of non-word characters with a single
    space, drop underscores, lowercase, tokenize with the module-level spaCy
    pipeline ``nlp`` and discard stopwords and whitespace-only tokens.

    Args:
        text: Raw tweet text. Any non-string value (for example the float NaN
            that pandas uses for a missing cell, or None) is treated as an
            empty tweet instead of raising TypeError.

    Returns:
        str: The surviving tokens joined by single spaces, or '' when nothing
        is left after cleaning.
    """
    # Missing cells arrive as NaN/None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Remove URLs. 'http\S+' already covers every 'https\S+' match, and the
    # pattern has no ^/$ anchors, so re.MULTILINE had no effect.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Collapse each run of non-word characters into one space. Note that \W
    # keeps letters, digits and underscores (not only alphabetic characters).
    text = re.sub(r'\W+', ' ', text)
    # After the step above '_' is the only string.punctuation character that
    # can still be present; this removes it.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip().lower()
    if not text:
        return ''
    # Only token boundaries and the lexical is_stop flag are needed, so run
    # the tokenizer alone instead of the full tagger/parser/NER pipeline.
    doc = nlp.make_doc(text)
    # Skip stopwords and whitespace-only tokens (left behind where an
    # underscore was deleted between two spaces).
    words = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_space)
    ]
    return ' '.join(words)

In [21]:
# Clean every tweet element-wise and write the result back to the same
# column, which the later cells read as the model input.
df['tweet'] = df['tweet'].map(preprocess_text)

In [22]:
# Features are the cleaned tweets; the target is the sentiment label.
X, y = df['tweet'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Bag-of-words features: the vocabulary is learned from the training tweets
# only, then reused unchanged to encode the held-out tweets.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [24]:
# Fit a multinomial Naive Bayes model on the training count matrix.
classifier = MultinomialNB()
classifier.fit(X=X_train_vectorized, y=y_train)

In [25]:
# Predict a sentiment label for every held-out tweet.
y_pred = classifier.predict(X=X_test_vectorized)

In [26]:
# Compare predictions with the held-out labels: overall accuracy first,
# then per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 90.28%
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.93      1152
           1       0.81      0.84      0.82       432

    accuracy                           0.90      1584
   macro avg       0.88      0.88      0.88      1584
weighted avg       0.90      0.90      0.90      1584

