<a href="https://colab.research.google.com/github/Anas-abahaj/Sentiment-analysis/blob/main/Sentiment_analysis_using_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None, on_bad_lines="skip")

# Assign column names
df.columns = ["target", "id", "date", "flag", "user", "text"]

# Select relevant columns
df = df[["target", "text"]]

# Map sentiment labels (Convert 0 -> negative, 4 -> positive)
df["target"] = df["target"].map({0: 0, 4: 1})  # 0=negative, 1=positive

# Reduce dataset size for faster processing
data = df.sample(frac=0.25, random_state=42)  # Take 25% of data

In [23]:
data.head()

Unnamed: 0,target,text
541200,0,@chrishasboobs AHHH I HOPE YOUR OK!!!
750,0,"@misstoriblack cool , i have no tweet apps fo..."
766711,0,@TiannaChaos i know just family drama. its la...
285055,0,School email won't open and I have geography ...
705995,0,upper airways problem


In [24]:
data.tail()

Unnamed: 0,target,text
41450,0,@brykins Splendid! I was told I looked like a ...
355871,0,@herbadmother I'm so sorry! that IS sad
1251663,1,@JosieStingray Sounds like Eddie Murphy is coo...
66109,0,http://twitpic.com/4incl - The tiny Porter pla...
1334209,1,"Im glad i got my old gameboy to work,now whene..."


In [25]:
print('lenght of data is', len(data))

lenght of data is 400000


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400000 entries, 541200 to 1334209
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  400000 non-null  int64 
 1   text    400000 non-null  object
dtypes: int64(1), object(1)
memory usage: 9.2+ MB


In [27]:
np.sum(data.isnull().any(axis=1))

0

In [28]:
import re
import string
import nltk
from nltk.corpus import stopwords

In [29]:
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [30]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{"they'd", 'yours', 'then', 'above', "we'd", "weren't", 'ours', 'now', 'the', "haven't", 'these', 'does', 'myself', 'can', "mustn't", "won't", "we'll", 'having', 'which', 'between', 'o', "they've", 'until', 'her', 'about', "hadn't", 'yourself', 'were', "you're", 'being', 'other', "they're", 'own', 'there', 'over', 'same', "he'd", 'such', 'had', "shan't", 'y', 'under', 'each', "he'll", 'they', 'haven', "you'd", 'couldn', 'into', 'those', 'where', "aren't", 'ma', 'mightn', 'shouldn', "doesn't", "mightn't", 'don', 'against', 'has', "you'll", 'me', "i'm", 'did', 'wouldn', 'will', 'him', "i've", "i'll", 'was', 'been', 'do', 'themselves', 'we', 'so', 'it', 'that', 'during', "i'd", 'any', 'by', 'off', 'hasn', 'if', 'shan', 'should', 've', 'here', "he's", 'i', 'ain', 'hers', 'isn', 'too', 'hadn', 'whom', "she'll", 'm', 'yourselves', 'some', "couldn't", 'few', 'to', 'or', "that'll", 'your', 'won', 'all', "it's", 'needn', "they'll", 'below', "isn't", 'with', 'before', 'she', 'both', 'of', 'itsel

In [31]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'@\w+', '', text)  # Remove mentions (@user)
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuations
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

data["cleaned_text"] = data["text"].apply(clean_text)
print(data["cleaned_text"].head())

541200                                         ahhh hope ok
750                                    cool tweet apps razr
766711    know family drama lamehey next time u hang kim...
285055    school email wont open geography stuff revise ...
705995                                upper airways problem
Name: cleaned_text, dtype: object


# Tokenization, Stemming, Lemmatization, POS Tagging, NER, and Word Vectors

**Using NLTK laibrary**

In [32]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk

In [33]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def process_nltk(text):
    tokens = word_tokenize(text)  # Tokenization
    stemmed = [stemmer.stem(word) for word in tokens]  # Stemming
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    pos_tags = pos_tag(tokens)  # POS Tagging
    ner_tree = ne_chunk(pos_tags)  # Named Entity Recognition (NER)

    named_entities = []
    for subtree in ner_tree:
        if hasattr(subtree, "label"):
            entity = " ".join([token for token, pos in subtree.leaves()])
            named_entities.append((entity, subtree.label()))

    return {
        "tokens": tokens,
        "stemmed": stemmed,
        "lemmatized": lemmatized,
        "pos_tags": pos_tags,
        "ner": named_entities
    }

# Apply function to a sample tweet
sample_text = data["cleaned_text"].iloc[2]
nltk_results = process_nltk(sample_text)

print("NLTK Tokenization:", nltk_results["tokens"])
print("NLTK Stemming:", nltk_results["stemmed"])
print("NLTK Lemmatization:", nltk_results["lemmatized"])
print("NLTK POS Tags:", nltk_results["pos_tags"])
print("NLTK Named Entities:", nltk_results["ner"])

NLTK Tokenization: ['know', 'family', 'drama', 'lamehey', 'next', 'time', 'u', 'hang', 'kim', 'n', 'u', 'guys', 'like', 'sleepover', 'whatever', 'ill', 'call', 'u']
NLTK Stemming: ['know', 'famili', 'drama', 'lamehey', 'next', 'time', 'u', 'hang', 'kim', 'n', 'u', 'guy', 'like', 'sleepov', 'whatev', 'ill', 'call', 'u']
NLTK Lemmatization: ['know', 'family', 'drama', 'lamehey', 'next', 'time', 'u', 'hang', 'kim', 'n', 'u', 'guy', 'like', 'sleepover', 'whatever', 'ill', 'call', 'u']
NLTK POS Tags: [('know', 'VB'), ('family', 'NN'), ('drama', 'NN'), ('lamehey', 'IN'), ('next', 'JJ'), ('time', 'NN'), ('u', 'JJ'), ('hang', 'NN'), ('kim', 'NN'), ('n', 'JJ'), ('u', 'JJ'), ('guys', 'NNS'), ('like', 'IN'), ('sleepover', 'NN'), ('whatever', 'WDT'), ('ill', 'RB'), ('call', 'VBP'), ('u', 'JJ')]
NLTK Named Entities: []


**Using SpaCy labrairy**

In [34]:
import spacy

nlp = spacy.load("en_core_web_sm")  # Load small English model

def process_spacy(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]  # Tokenization
    lemmatized = [token.lemma_ for token in doc]  # Lemmatization
    pos_tags = [(token.text, token.pos_) for token in doc]  # POS Tagging
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]  # Named Entity Recognition (NER)

    # Word vectors (if available)
    word_vectors = {token.text: token.vector for token in doc if token.has_vector}

    return {
        "tokens": tokens,
        "lemmatized": lemmatized,
        "pos_tags": pos_tags,
        "ner": named_entities,
        "word_vectors": word_vectors
    }

# Apply function to the same sample tweet
spacy_results = process_spacy(sample_text)

print("SpaCy Tokenization:", spacy_results["tokens"])
print("SpaCy Lemmatization:", spacy_results["lemmatized"])
print("SpaCy POS Tags:", spacy_results["pos_tags"])
print("SpaCy Named Entities:", spacy_results["ner"])
print("SpaCy Word Vectors (first word):", list(spacy_results["word_vectors"].values())[0])


SpaCy Tokenization: ['know', 'family', 'drama', 'lamehey', 'next', 'time', 'u', 'hang', 'kim', 'n', 'u', 'guys', 'like', 'sleepover', 'whatever', 'ill', 'call', 'u']
SpaCy Lemmatization: ['know', 'family', 'drama', 'lamehey', 'next', 'time', 'u', 'hang', 'kim', 'n', 'u', 'guy', 'like', 'sleepover', 'whatever', 'ill', 'call', 'u']
SpaCy POS Tags: [('know', 'VERB'), ('family', 'NOUN'), ('drama', 'NOUN'), ('lamehey', 'PROPN'), ('next', 'ADJ'), ('time', 'NOUN'), ('u', 'PRON'), ('hang', 'VERB'), ('kim', 'PROPN'), ('n', 'PROPN'), ('u', 'PROPN'), ('guys', 'NOUN'), ('like', 'ADP'), ('sleepover', 'NOUN'), ('whatever', 'PRON'), ('ill', 'ADJ'), ('call', 'NOUN'), ('u', 'NOUN')]
SpaCy Named Entities: [('kim', 'PERSON')]
SpaCy Word Vectors (first word): [-0.16450804 -1.483369    0.42516464  0.59294677 -0.67554724 -0.349852
  0.9464916   0.80744666 -0.1556697   0.23997074  0.14656189  0.41497397
 -0.5950614   1.0855898  -0.47182953 -0.5323664   0.40713558 -1.5859385
  0.70478374 -0.18128091 -0.856852

**Text Preprocessing with Gensim**

Gensim provides built-in functions for tokenization, normalization, stemming, and lemmatization.

In [35]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, stem_text, preprocess_string, strip_punctuation, strip_numeric
#from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Custom text preprocessing function
def preprocess_gensim(text):
    text = text.lower()  # Normalize case
    text = remove_stopwords(text)  # Remove stopwords
    text = strip_punctuation(text)  # Remove punctuation
    text = strip_numeric(text)  # Remove numbers
    tokens = simple_preprocess(text)  # Tokenization
    stemmed = [stem_text(word) for word in tokens]  # Stemming
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return {
        "tokens": tokens,
        "stemmed": stemmed,
        "lemmatized": lemmatized
    }

# Apply to a sample tweet
sample_text = data["cleaned_text"].iloc[0]
gensim_results = preprocess_gensim(sample_text)

print("Gensim Tokenization:", gensim_results["tokens"])
print("Gensim Stemming:", gensim_results["stemmed"])
print("Gensim Lemmatization:", gensim_results["lemmatized"])


Gensim Tokenization: ['ahhh', 'hope', 'ok']
Gensim Stemming: ['ahhh', 'hope', 'ok']
Gensim Lemmatization: ['ahhh', 'hope', 'ok']


**Document Representation using Gensim**

We will convert tweets into high-dimensional **vector representations**.

In [36]:
from gensim.corpora import Dictionary

# Tokenize the dataset
data["gensim_tokens"] = data["cleaned_text"].apply(lambda x: simple_preprocess(x))

# Create a dictionary from the tokenized tweets
dictionary = Dictionary(data["gensim_tokens"])

# Convert tweets into Bag-of-Words (BoW) representation
corpus_bow = [dictionary.doc2bow(text) for text in data["gensim_tokens"]]

print("Sample BoW Representation:", corpus_bow[:2])  # Example of first 2 tweets


Sample BoW Representation: [[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1), (5, 1), (6, 1)]]


**Word Embeddings using Word2Vec**

Gensim supports Word2Vec and GloVe to learn word embeddings.

In [41]:
from gensim.models import Word2Vec

# Train Word2Vec model
w2v_model = Word2Vec(sentences=data["gensim_tokens"], vector_size=300, window=5, min_count=2, workers=4)

# Get vector for a word
word_vector = w2v_model.wv["happy"]  # Example word vector
print("Word Vector for 'happy':", word_vector[:10])  # Show first 10 values

Word Vector for 'happy': [ 0.42489234  1.1519021   0.920759    0.5596347   0.30815944 -0.49482748
 -0.4380642   0.62900376  0.8123519   0.21661776]


In [53]:
import numpy as np

# Function to convert a tweet (list of tokens) to a vector
def tweet_to_vector(tokens, model, vector_size=300):
    vectors = []
    for token in tokens:
        if token in model.wv:  # Check if the word is in the vocabulary
            vectors.append(model.wv[token])  # Get the word vector
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)  # Average the word vectors
    else:
        return np.zeros(vector_size)  # Return a zero vector if no words are found

# Apply the function to all tweets
data["tweet_vector"] = data["gensim_tokens"].apply(lambda x: tweet_to_vector(x, w2v_model))

In [54]:
# Print the first tweet vector
first_tweet_vector = data["tweet_vector"].iloc[0]
print("First Tweet Vector (first 10 values):", first_tweet_vector[:10])
print("Shape of the vector:", first_tweet_vector.shape)

First Tweet Vector (first 10 values): [-0.22619586  0.01662495 -0.08241827 -0.04154763  0.02368674 -0.22663213
 -0.14134313  0.8606906   0.06888402 -0.21909879]
Shape of the vector: (300,)


In [55]:
# Convert all tweet vectors to a NumPy array
tweet_vectors = np.array(data["tweet_vector"].tolist())

# Save to a file
np.save("tweet_vectors.npy", tweet_vectors)

In [58]:
# Print the first 5 tweet vectors
for i, vector in enumerate(tweet_vectors[:5]):
    print(f"Tweet {i+1} Vector (first 10 values):", vector[:10])
    print(f"Shape of the vector:", vector.shape)
    print("-" * 50)

Tweet 1 Vector (first 10 values): [-0.22619586  0.01662495 -0.08241827 -0.04154763  0.02368674 -0.22663213
 -0.14134313  0.86069059  0.06888402 -0.21909879]
Shape of the vector: (300,)
--------------------------------------------------
Tweet 2 Vector (first 10 values): [ 0.06694639 -0.46550232  0.1992816   0.0148522   0.25755888 -0.07808398
 -0.13612124  0.28458598 -0.08352678 -0.09251469]
Shape of the vector: (300,)
--------------------------------------------------
Tweet 3 Vector (first 10 values): [-0.03115662 -0.12817708  0.11884255 -0.12331461 -0.3552461  -0.17043146
 -0.16001394  0.67719871 -0.10042305  0.17600764]
Shape of the vector: (300,)
--------------------------------------------------
Tweet 4 Vector (first 10 values): [-0.09890971  0.03893366 -0.02794758  0.282543   -0.14405398 -0.2921595
 -0.53804374  0.46554863 -0.45270666  0.26336318]
Shape of the vector: (300,)
--------------------------------------------------
Tweet 5 Vector (first 10 values): [ 0.02924231 -0.2422614

# Apply Machine Learning Algorithms for Sentiment Analysis



1.   **Prepare Data for ML Models**



In [65]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features (X) and labels (y)
X = tweet_vectors  # Gensim Word2Vec vectors
y = data["target"].values  # Labels (0: Negative, 1: Positive)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Train Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr.predict(X_test)

# Evaluate
print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Results:
Accuracy: 0.7466416666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74     59893
           1       0.74      0.76      0.75     60107

    accuracy                           0.75    120000
   macro avg       0.75      0.75      0.75    120000
weighted avg       0.75      0.75      0.75    120000



In [None]:
from sklearn.svm import SVC

# Train SVM
svm = SVC()
svm.fit(X_train, y_train)

# Predict on test data
y_pred_svm = svm.predict(X_test)

# Evaluate
print("SVM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf.predict(X_test)

# Evaluate
print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))