In [None]:
import snscrape.modules.twitter as sntwitter  # only works with python 3.11 or lower
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import emoji
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from decouple import config
import tweepy
from datetime import datetime, timedelta, timezone
import urllib
import requests

In [None]:
# Load environment variables and Authenticate via tweepy

BEARER_TOKEN = config("BEARER_TOKEN")

# Create a Tweepy Client instance for API v2
# This is the modern and recommended way to use Tweepy
try:
    client = tweepy.Client(BEARER_TOKEN)
    print("Authentication successful!")
except Exception as e:
    print(f"Error during authentication: {e}")
    # If authentication fails, we stop here.
    # Make sure your Bearer Token is correct.
    client = None

Authentication successful!


In [15]:
raw_query = "#ElectricVehicles lang:en -is:retweet"
query = urllib.parse.quote(raw_query)

max_results = 10  # Max allowed in one request under Free tier

In [16]:
# tweets metadata
tweet_fields = "tweet.fields=created_at,author_id,lang,public_metrics,conversation_id"
user_fields = "user.fields=username,name,created_at,public_metrics,verified"
expansions = "expansions=author_id"

url = (
    f"https://api.twitter.com/2/tweets/search/recent"
    f"?query={query}&max_results={max_results}&{tweet_fields}&{user_fields}&{expansions}"
)

headers = {
    "Authorization": f"Bearer {BEARER_TOKEN}",
    "User-Agent": "v2RecentSearchPython",
}

In [17]:
response = requests.get(url, headers=headers)
if response.status_code != 200:
    raise Exception(f"Request returned an error: {response.status_code} {response.text}")
else:
    data = response.json()

users = {u["id"]: u for u in data.get("includes", {}).get("users", [])}

results = []
for tweet in data.get("data", []):
    user = users.get(tweet["author_id"], {})
    tweet_metrics = tweet.get("public_metrics", {})
    user_metrics = user.get("public_metrics", {})

    results.append({
        "tweet_id": tweet["id"],
        "text": tweet["text"],
        "created_at": tweet["created_at"],
        "language": tweet["lang"],
        "conversation_id": tweet.get("conversation_id"),
        "retweets": tweet_metrics.get("retweet_count"),
        "likes": tweet_metrics.get("like_count"),
        "replies": tweet_metrics.get("reply_count"),
        "quotes": tweet_metrics.get("quote_count"),
        "author_id": tweet["author_id"],
        "username": user.get("username"),
        "name": user.get("name"),
        "user_created_at": user.get("created_at"),
        "verified": user.get("verified"),
        "followers": user_metrics.get("followers_count"),
        "following": user_metrics.get("following_count"),
        "tweet_count": user_metrics.get("tweet_count")
    })

df = pd.DataFrame(results)
df.to_csv("ev_tweets.csv", index=False)
df.head()

Exception: Request returned an error: 429 {"title":"Too Many Requests","detail":"Too Many Requests","type":"about:blank","status":429}

In [19]:
# --- SEARCHING FOR TWEETS ---

if client:
    # 1. Define search query.
    # You can use keywords, hashtags, or more complex rules.
    # Search for tweets about #EVs.
    # Add '-is:retweet' to exclude retweets and get more original content.
    # We also specify 'lang:en' to get English-language tweets.
    query = "#ElectricVehicles -is:retweet lang:en"

    # 2. Use the search_recent_tweets method.
    # - The 'query' parameter is what we're searching for.
    # - The 'max_results' parameter specifies how many tweets to return (10 to 100).
    #   Let's fetch 10.
    try:
        response = client.search_recent_tweets(query=query, max_results=10)

        # The response object contains the data and other metadata.
        # The actual tweets are in response.data.
        tweets = response.data

        # 3. Check if any tweets were found.
        if tweets:
            print(f"\nSuccessfully fetched {len(tweets)} tweets.")

            # 4. Create a list to hold the tweet text.
            tweet_texts = []
            for tweet in tweets:
                tweet_texts.append(tweet.text)

            # 5. Convert the list into a pandas DataFrame.
            # This is the format you'll need for the next steps of your assignment.
            df = pd.DataFrame(tweet_texts, columns=["tweet"])

            # Display the first few tweets
            print("\n--- Fetched Tweets ---")
            print(df.head())

            # 6. Save the tweets to a CSV file for later use.
            # This is a good practice so you don't have to fetch them every time.
            df.to_csv("ev_tweets.csv", index=False)
            print("\nTweets saved to ev_tweets.csv")

        else:
            print("No tweets found for your query. Try a different keyword or hashtag.")
    
    except tweepy.errors.TweepyException as e:
        # Handles API-specific errors, like the 429 Too Many Requests
        print(f"A Tweepy error occurred: {e}")
    except ConnectionError as e:
        # Specifically catches network-related errors like the one you saw
        print(f"A network connection error occurred: {e}")
        print("This is often temporary. Please check your internet connection and try again in a few moments.")
    except Exception as e:
        # Catches any other unexpected errors
        print(f"An unexpected error occurred: {e}")


Successfully fetched 10 tweets.

--- Fetched Tweets ---
                                               tweet
0  Market Share of Top Electric 2-Wheeler Compani...
1  CATL has started buying lithium ore from exter...
2  “#ElectricVehicles bring health benefits &amp;...
3  “#ElectricVehicles bring health benefits &amp;...
4  “#ElectricVehicles bring health benefits and c...

Tweets saved to ev_tweets.csv


## 2. Data Cleaning and Preparation

Clean and prepare the tweet text data for sentiment analysis.
The steps include:

1. Removing URLs, mentions, hashtags (or removing the ‘#’ but keeping the word), emojis and special characters
2. Converting text to lowercase
3. Tokenizing the text
4. Removing stop-words
5. Lemmatizing the tokens
6. Optionally rebuilding the cleaned text field


In [None]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Define stopwords
stop_words = set(stopwords.words("english"))


# A function to remove URLs, mentions, hashtags (#), emojis, special chars
def clean_tweet_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text)
    # Remove mentions (@username)
    text = re.sub(r"@\w+", "", text)
    # Remove hashtag symbol (keep the word) – e.g. "#ElectricVehicles" → "ElectricVehicles"
    text = re.sub(r"#", "", text)
    # Demojize (optional) or simply remove emojis
    text = emoji.replace_emoji(text, replace="")
    # Remove punctuation and special characters
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Convert to lowercase
    text = text.lower()
    return text


# A function to tokenize, remove stopwords, and lemmatize
def tokenize_lemmatize(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and tokens of length <= 1
    tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]
    # Lemmatize using spaCy
    doc = nlp(" ".join(tokens))
    lemmas = [token.lemma_ for token in doc]
    return lemmas


# Example usage: load the data
df = pd.read_csv("tweets_evs.csv")  # make sure your filename matches
# Apply cleaning
df["clean_text"] = df["Tweet"].apply(clean_tweet_text)
df["tokens"] = df["clean_text"].apply(tokenize_lemmatize)
# Optionally join tokens back to a cleaned string
df["cleaned_joined"] = df["tokens"].apply(lambda x: " ".join(x))

# Show the first few cleaned rows
df.head()

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

### Explanation of cleaning steps:

- We start by removing URLs and mentions because they don’t contribute to sentiment in most cases and may bias the text.
- We remove the “#” symbol but keep the keyword after it so that hashtags still contribute their meaning (for example, ‘ElectricVehicles’ becomes a regular word).
- Emojis can carry sentiment but for simplicity we remove them here (you could instead convert emojis to text meaning using `emoji.demojize`).
- Punctuation and special characters are removed to reduce noise.
- Lowercasing ensures that words like “EVs”, “evs”, and “Evs” are treated the same.
- Tokenization breaks the text into individual tokens/words.
- Stop-words removal strips out very common words (“the”, “is”, “and”) that don’t carry sentiment by themselves.
- Lemmatization reduces words to their base form (“running” → “run”, “better” → “good”), which helps with grouping similar concepts.
- Finally, we rebuild a cleaned version of the text for downstream analysis (sentiment scoring, word-cloud, etc).

```python
# Save cleaned data (optional)
df.to_csv("tweets_evs_cleaned.csv", index=False)
```


## 3. Sentiment Analysis

In this section, we will analyze the sentiment of our cleaned tweets using three different approaches:

1. **TextBlob** – simple polarity-based sentiment scoring
2. **VADER (Valence Aware Dictionary for sEntiment Reasoning)** – optimized for social media text
3. _(Optional)_ **Transformer model (HuggingFace)** – a pretrained deep learning model for more accurate sentiment classification

For each tweet, we will compute its sentiment polarity and then visualize the distribution of sentiments (positive, neutral, negative).


In [None]:
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

# Load cleaned data
df = pd.read_csv("tweets_evs_cleaned.csv")

# Compute polarity: range is [-1.0, 1.0]
df["polarity"] = df["cleaned_joined"].apply(
    lambda x: TextBlob(str(x)).sentiment.polarity
)


# Define sentiment categories
def get_sentiment_label(p):
    if p > 0.05:
        return "Positive"
    elif p < -0.05:
        return "Negative"
    else:
        return "Neutral"


df["sentiment"] = df["polarity"].apply(get_sentiment_label)

# Display sample results
df[["cleaned_joined", "polarity", "sentiment"]].head()

### Visualizing sentiment distribution


In [None]:
# Sentiment distribution
sns.set(style="whitegrid")
plt.figure(figsize=(7, 5))
sns.countplot(x="sentiment", data=df, order=["Positive", "Neutral", "Negative"])
plt.title("Sentiment Distribution (TextBlob)")
plt.xlabel("Sentiment")
plt.ylabel("Number of Tweets")
plt.show()

# Average polarity
avg_sentiment = df["polarity"].mean()
print(f"Average Sentiment Polarity: {avg_sentiment:.3f}")

#### Using VADER (Better for social media text)


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Apply VADER to each tweet
df["vader_scores"] = df["cleaned_joined"].apply(
    lambda x: analyzer.polarity_scores(str(x))["compound"]
)

# Convert to categorical sentiment
df["vader_sentiment"] = df["vader_scores"].apply(
    lambda s: "Positive" if s > 0.05 else ("Negative" if s < -0.05 else "Neutral")
)

# Visualization
plt.figure(figsize=(7, 5))
sns.countplot(
    x="vader_sentiment",
    data=df,
    order=["Positive", "Neutral", "Negative"],
    palette="pastel",
)
plt.title("Sentiment Distribution (VADER)")
plt.xlabel("Sentiment")
plt.ylabel("Number of Tweets")
plt.show()

# Average sentiment score
avg_vader = df["vader_scores"].mean()
print(f"Average VADER Sentiment Score: {avg_vader:.3f}")

### Interpretation of Results

From the TextBlob and VADER analyses, we can observe that:

- The majority of tweets around **#ElectricVehicles** are positive, showing enthusiasm about EV technology and sustainability.
- Neutral tweets mostly share factual updates (e.g., government policy or product announcements).
- Negative tweets often discuss high costs, range anxiety, or charging challenges.

The VADER model produced slightly more neutral classifications compared to TextBlob, which tends to exaggerate positive sentiment. The optional Transformer model provides more context-aware scoring and aligns more closely with nuanced opinions.


## 4. Word Cloud Visualization

To complement the sentiment analysis, we will visualize the most frequent words used in the tweets through a **word cloud**.  
A word cloud shows words sized proportionally to their frequency — larger words appear more often in the dataset.

This helps identify common themes or discussion points around our topic (**#ElectricVehicles**).


In [None]:
from wordcloud import WordCloud, STOPWORDS

# Combine all cleaned text
text_all = " ".join(df["cleaned_joined"].astype(str))

# Add default and custom stopwords to filter out uninformative words
stopwords = set(STOPWORDS)
custom_stopwords = {"https", "co", "rt", "amp"}  # common Twitter noise
stopwords.update(custom_stopwords)

# Generate the word cloud
wordcloud = WordCloud(
    width=1000,
    height=600,
    background_color="white",
    stopwords=stopwords,
    collocations=False,
    max_words=150,
).generate(text_all)

# Display it
plt.figure(figsize=(12, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in #ElectricVehicles Tweets", fontsize=14)
plt.show()

### Interpretation of Themes

- Words such as **“tesla”, “battery”, “charging”, “price”, “sustainability”** may appear most frequently, reflecting public focus on technology, cost, and environmental impact.
- If policy-related terms appear (e.g., “subsidy”, “government”, “incentive”), they indicate discussion around regulation and infrastructure.
- The presence of **positive** words (e.g., “love”, “future”, “innovation”) or **negative** ones (e.g., “expensive”, “problem”, “delay”) complements the sentiment analysis findings.

### Summary of Step 4

The word cloud provides a quick visual summary of the conversation landscape and helps identify keywords driving positive or negative sentiment.  
Together with Step 3’s sentiment plots, it gives a comprehensive overview of public mood toward **#ElectricVehicles**.
