In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r"C:\Users\administrateur\Downloads\tweets-data.csv")

# Sample 500 rows (for reproducibility, use random_state)
df_sample = df.sample(n=500, random_state=42).reset_index(drop=True)


In [5]:
print(df_sample.columns)

Index(['Unnamed: 0', 'Date Created', 'Number of Likes', 'Source of Tweet',
       'Tweets', 'hashtag'],
      dtype='object')


In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def clean_tweet(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)                # Remove mentions and hashtags
    text = re.sub(r'\d+', '', text)                      # Remove digits
    text = re.sub(r'[^\w\s]', '', text)                  # Remove punctuation
    tokens = word_tokenize(text)
    cleaned_tokens = [w for w in tokens if w not in stop_words]
    return " ".join(cleaned_tokens)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df_sample["Tweets"] = df_sample["Tweets"].apply(clean_tweet)

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    scores = sia.polarity_scores(text)
    compound = scores['compound']
    if compound >= 0.05:
        sentiment = "positive"
    elif compound <= -0.05:
        sentiment = "negative"
    else:
        sentiment = "neutral"
    return pd.Series([sentiment, compound])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\administrateur\AppData\Roaming\nltk_data...


In [12]:
df_sample[["sentiment_label", "sentiment_score"]] = df_sample["Tweets"].apply(get_sentiment_score)

In [15]:
print(df_sample[["Tweets", "sentiment_label", "sentiment_score"]].head(10))

                                              Tweets sentiment_label  \
0  le de sanaga ls sont morts comme ils ont vécu ...         neutral   
1                                                            neutral   
2                                  exclusive content        positive   
3  auch heute geht die politische nachricht des t...        negative   
4  type would take homemade playstationcontrolled...         neutral   
5                                                            neutral   
6  mishap incredible force amp speed crushing wat...        negative   
7                                                            neutral   
8                                   le le retour via         neutral   
9  il segretario di stato americano non credo che...         neutral   

   sentiment_score  
0           0.0000  
1           0.0000  
2           0.1280  
3          -0.5994  
4           0.0000  
5           0.0000  
6          -0.5859  
7           0.0000  
8           0.0000