In [10]:
#Load the libraries

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
#Load the dataset
df = pd.read_csv(r"C:\Users\user\Desktop\crowdflower-brands-and-product-emotions\judge-1377884607_tweet_product_company.csv", encoding='ISO-8859-1')
df.head()


Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [12]:
#Check the last rows
df.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


The dataset values appear uniform from top to bottom.

In [13]:
#Check shape
print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")

The dataset has 9093 rows and 3 columns.


In [14]:
#check for the dataset info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


Observation:
- The dataset primarily consists of text data(object type) across all columns.
- The column `emotion_in_tweet_is_directed_at` has many missing values only 3,291 out of 9,093 entries are non-null.
-The `tweet-text` column has only 1 missing value, which will be handled during cleaning.
- The dataset seems manageable in size as well.

In [15]:
# Check missing values and duplicates
print(df.isna().sum())
print(df.duplicated().sum())

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64
22


In [16]:
df.describe(include='object')

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,9092,3291,9093
unique,9065,9,4
top,RT @mention Marissa Mayer: Google Will Connect...,iPad,No emotion toward brand or product
freq,5,946,5389


Observation:
- The dataset is dominated by tweets without strong emotions towards a brand/product.
-Only a subset of tweets approximately 37%(3,291 out of 9,093) tweets express an emotion directed at a brand.
- The presence of duplicates and missing values confirms that data cleaning will be required before modeling.

In [17]:
# Check Unique values per column
for col in df.columns:
    print(col,df[col].nunique())


tweet_text 9065
emotion_in_tweet_is_directed_at 9
is_there_an_emotion_directed_at_a_brand_or_product 4


Observation:

The dataset contains;
- 9,065 unique tweet texts, indicating minimal duplication in tweet content.
- 9 unique values in the `emotion_in_tweet_is_directed_at` column, representing different brands or products mentioned in the tweets.
- 4 unique values in the `is_there_an_emotion_directed_at_a_brand_or_product` column, showing the possible emotion labels assigned to each tweet.

In [18]:
#Check value counts for key columns
df["is_there_an_emotion_directed_at_a_brand_or_product"].value_counts()

is_there_an_emotion_directed_at_a_brand_or_product
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64

- Most tweets(5,389) express no emotion toward a brand or product.
- 2,978 tweets show positive emotion while 570 are negative
- A small number(156) are uncertain(I can't tell) indicating a class imbalance, with "No emotion" being the dominant category.

In [19]:
df["emotion_in_tweet_is_directed_at"].value_counts()

emotion_in_tweet_is_directed_at
iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: count, dtype: int64

- The most mentioned products are ipad and Apple
- Mentions of Google and its products are fewer but still significant. It shows that the dataset is skewed towards Apple-related tweets, which may influence model bias.

### Column Renaming
To simplify later analysis and code readability, we will rename the columns.

In [20]:
# Rename the columns
df = df.rename(columns={
    'tweet_text': 'text',
    'emotion_in_tweet_is_directed_at': 'brand',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'
})

# Recheck column names
print("Columns after renaming:", df.columns.tolist())
df.head(3)


Columns after renaming: ['text', 'brand', 'sentiment']


Unnamed: 0,text,brand,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion


## Data Preparation
### Data Cleaning

In [21]:
df.isnull().sum()

text            1
brand        5802
sentiment       0
dtype: int64

In [22]:
# Remove tweet with no text/sentiment
df = df.dropna(subset=["text", "sentiment"])

Tweets missing a `brand` will be retained, as it still express sentiment that might be relevant for our case.

In [23]:
# Check duplicates
df.duplicated().sum()

np.int64(22)

In [24]:
#Drop the duplicates
df.drop_duplicates(inplace=True)
#we wil then confirm the removal
print("Number of duplicate rows after removal:", df.duplicated().sum())
print("New dataset shape:", df.shape)


Number of duplicate rows after removal: 0
New dataset shape: (9070, 3)


In [25]:
# Save the cleaned dataset
df.to_csv("clean_apple_google_tweets.csv", index=False)

### Exploratory Data Analysis
#### Univariate Analysis

### Text Preprocessing

In [26]:
#Load the clean dataset
df = pd.read_csv("clean_apple_google_tweets.csv")
df.head()

Unnamed: 0,text,brand,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [27]:
# Text preprocessing

# Convert to lowercase
df["text"] = df["text"].str.lower()
print("Lowercased:", df["text"])




Lowercased: 0       .@wesley83 i have a 3g iphone. after 3 hrs twe...
1       @jessedee know about @fludapp ? awesome ipad/i...
2       @swonderlin can not wait for #ipad 2 also. the...
3       @sxsw i hope this year's festival isn't as cra...
4       @sxtxstate great stuff on fri #sxsw: marissa m...
                              ...                        
9065                        ipad everywhere. #sxsw {link}
9066    wave, buzz... rt @mention we interrupt your re...
9067    google's zeiger, a physician never reported po...
9068    some verizon iphone customers complained their...
9069    ï¡ïàü_êîò£áââ_£â_ûârt @...
Name: text, Length: 9070, dtype: object


In [28]:
# Remove URLs, mentions, Hashtags, Punctuation, Digits, Extraspaces
def clean_text(text):
    # remove URLs
    text = re.sub(r'http\S+', '', text)
		# remove mentions
    text = re.sub(r'@\w+', '', text)
    # remove hashtags
    text = re.sub(r'#\w+', '', text)
    # remove punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
df["clean_text"] = df["text"].apply(clean_text)
df.head()

Unnamed: 0,text,brand,sentiment,clean_text
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,iPhone,Negative emotion,i have a g iphone after hrs tweeting at it was...
1,@jessedee know about @fludapp ? awesome ipad/i...,iPad or iPhone App,Positive emotion,know about awesome ipadiphone app that youll l...
2,@swonderlin can not wait for #ipad 2 also. the...,iPad,Positive emotion,can not wait for also they should sale them do...
3,@sxsw i hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,i hope this years festival isnt as crashy as t...
4,@sxtxstate great stuff on fri #sxsw: marissa m...,Google,Positive emotion,great stuff on fri marissa mayer google tim or...


In [29]:
# Removing stopwords i.e common words that do not carry much meaning
stop_words = set(stopwords.words('english'))
df["clean_text"] = df["clean_text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df.head(3)

Unnamed: 0,text,brand,sentiment,clean_text
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,iPhone,Negative emotion,g iphone hrs tweeting dead need upgrade plugin...
1,@jessedee know about @fludapp ? awesome ipad/i...,iPad or iPhone App,Positive emotion,know awesome ipadiphone app youll likely appre...
2,@swonderlin can not wait for #ipad 2 also. the...,iPad,Positive emotion,wait also sale


In [30]:
# Lemmatization to reduce words to their base form to ensure uniformity
lemmatizer = WordNetLemmatizer()
df["clean_text"] = df["clean_text"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df.head(3)

Unnamed: 0,text,brand,sentiment,clean_text
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,iPhone,Negative emotion,g iphone hr tweeting dead need upgrade plugin ...
1,@jessedee know about @fludapp ? awesome ipad/i...,iPad or iPhone App,Positive emotion,know awesome ipadiphone app youll likely appre...
2,@swonderlin can not wait for #ipad 2 also. the...,iPad,Positive emotion,wait also sale


In [31]:
# tokenization to split each tweet into individual word-tokens to be used in feature extraction 
df["tokens"] = df["clean_text"].apply(word_tokenize)
df[["clean_text", "tokens"]].head()

Unnamed: 0,clean_text,tokens
0,g iphone hr tweeting dead need upgrade plugin ...,"[g, iphone, hr, tweeting, dead, need, upgrade,..."
1,know awesome ipadiphone app youll likely appre...,"[know, awesome, ipadiphone, app, youll, likely..."
2,wait also sale,"[wait, also, sale]"
3,hope year festival isnt crashy year iphone app,"[hope, year, festival, isnt, crashy, year, iph..."
4,great stuff fri marissa mayer google tim oreil...,"[great, stuff, fri, marissa, mayer, google, ti..."


#### Feature Extraction(Converting Text to Numbers)

In [32]:
# Using TF-IDF(Term Frequency-Inverse Document Frequency)

# initialize the vectorizer
tfidf = TfidfVectorizer()

# fit and transform cleaned text
X = tfidf.fit_transform(df["clean_text"])

print("Shape of TF-IDF matrix:" , X.shape)
print("-------"*15)
print("TF-IDF Vocabulary:", tfidf.get_feature_names_out())
print("-------"*15)
print("Vocabulary size:", len(tfidf.get_feature_names_out()))
print("-------"*15)
print("TF-IDF Representation:", X.toarray())


Shape of TF-IDF matrix: (9070, 8388)
---------------------------------------------------------------------------------------------------------
TF-IDF Vocabulary: ['aapl' 'aaron' 'ab' ... 'zuckerberg' 'zynga' 'zzzs']
---------------------------------------------------------------------------------------------------------
Vocabulary size: 8388
---------------------------------------------------------------------------------------------------------
TF-IDF Representation: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


TF-IDF assigns higher weight to words that are important in a specific tweet but rare across all tweets. It also helps the model focus on meaningful words that carry sentiments.


In [33]:
# Using CountVectorizer or Bag of Words(BoW)

#Initialize the vectorizer
vectorizer = CountVectorizer()

#Fit and transform cleaned text
X_bow = vectorizer.fit_transform(df["clean_text"])

print("Shape of Bag-of-Words matrix:", X_bow.shape)
print("-------"*15)
print("Bag-of-Words Vocabulary:", vectorizer.get_feature_names_out())
print("-------"*15)
print("Vocabulary size:", len(vectorizer.get_feature_names_out()))
print("-------"*15)
print("Bag-of-Words Representation:", X_bow.toarray())

Shape of Bag-of-Words matrix: (9070, 8388)
---------------------------------------------------------------------------------------------------------
Bag-of-Words Vocabulary: ['aapl' 'aaron' 'ab' ... 'zuckerberg' 'zynga' 'zzzs']
---------------------------------------------------------------------------------------------------------
Vocabulary size: 8388
---------------------------------------------------------------------------------------------------------
Bag-of-Words Representation: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Bag-of-Words (CountVectorizer) captures the raw frequency of words in each tweet, and is useful as a baseline model providing a straightforward representation of text.

Using both approches allows comparison of feature representations and model performance. While TF-IDF often yields better results for sentiment classification, BoW provides a simple, interpretable baseline.