In [5]:
import nltk
from nltk.corpus import twitter_samples

nltk.download('twitter_samples')

all_positive_tweets=twitter_samples.strings('positive_tweets.json')
all_negative_tweets=twitter_samples.strings('negative_tweets.json')
print('Number of positive tweets: ',len(all_positive_tweets))
print('Number of negative tweets: ',len(all_negative_tweets))

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


Number of positive tweets:  5000
Number of negative tweets:  5000


In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
#
nltk.download('punkt_tab')
#
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Data cleaning
def cleaned_tweet_with_lemmatization(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()

    # Tokenize the tweet (now just tokenizing for lemmatization, no stemming)
    tokens = nltk.word_tokenize(tweet)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Perform lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the lemmatized tokens back into a single string
    return ' '.join(lemmatized_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
cleaned_positive_tweets = [cleaned_tweet_with_lemmatization(tweet) for tweet in all_positive_tweets]
cleaned_negative_tweets = [cleaned_tweet_with_lemmatization(tweet) for tweet in all_negative_tweets]

In [8]:
# Combining the tweets
tweets = all_positive_tweets + all_negative_tweets
# Creating labels for positive 1 and for negatives 0
labels = [1] * len(cleaned_positive_tweets) + [0] * len(cleaned_negative_tweets)
# Combining the cleaned tweets
cleaned_tweets = [cleaned_tweet_with_lemmatization(tweet) for tweet in tweets]



In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Now we apply Tfi-Idf
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(cleaned_tweets)
print("TF-IDF Shape:", X.shape)

TF-IDF Shape: (10000, 5000)


In [10]:
# Now we split the data in train set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


svm_model = SVC(kernel='linear', random_state=42)


svm_model.fit(X_train, y_train)

y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Testing accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy: {test_accuracy:.2f}")

# Detailed classification report for training data
print("\nTraining Data Metrics:")
print(classification_report(y_train, y_train_pred))

# Detailed classification report for testing data
print("\nTesting Data Metrics:")
print(classification_report(y_test, y_test_pred))




Training Accuracy: 0.89
Testing Accuracy: 0.76

Training Data Metrics:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      4012
           1       0.91      0.86      0.88      3988

    accuracy                           0.89      8000
   macro avg       0.89      0.89      0.89      8000
weighted avg       0.89      0.89      0.89      8000


Testing Data Metrics:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76       988
           1       0.78      0.73      0.75      1012

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000

