Apple versus Google NLP project

In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.pardir)
#print(module_path)
if module_path not in sys.path:
    sys.path.append(module_path)
    
#from src.confusion import plot_confusion_matrix

In [2]:
df = pd.read_csv("data/judge-1377884607_tweet_product_company.csv", encoding="ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
df["emotion_in_tweet_is_directed_at"].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [6]:
df["is_there_an_emotion_directed_at_a_brand_or_product"].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [7]:
df["tweet_text"].iloc[5]

'@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd'

In [8]:
#keywords TBD will review sentiment
apple_keywords = ['iPad', 'IPad', 'i-Pad', 'IPAD', 'iphone', 'iPhone', 'Iphone', 'i-phone', 'I-Phone', 'Apple', 'apple', 'Ipad', 'ipad', 'iTunes', 'Mac', 'IPhone', 'IPHONE']
google_keywords = ['google', 'android', 'Google', 'Android', 'samsung', 'GOOGLE']

def find_brand(text):
    if isinstance(text, str):
        for word in apple_keywords:
            if word in text:
                return "Apple" 
        for word in google_keywords:
            if word in text:
                return "Google" 
    return 'No_Product'

df['product'] = df['tweet_text'].apply(find_brand)

In [9]:
df['product'].value_counts()

Apple         5587
Google        2771
No_Product     735
Name: product, dtype: int64

In [10]:
df['product'].isna().sum()

0

In [11]:
df['product'].head(500)

0           Apple
1           Apple
2           Apple
3           Apple
4          Google
          ...    
495         Apple
496         Apple
497    No_Product
498         Apple
499         Apple
Name: product, Length: 500, dtype: object

In [12]:
df['tweet_text'][497]

'Get #SXSW film red carpet coverage from @mention CW Austin Star Mandy Dugan on Grouped{in}, get the app {link}'

In [13]:
mapping = {"Positive emotion": 2, "No emotion toward brand or product": 1, "I can't tell": 1, "Negative emotion": 0}
df['emotion'] = df["is_there_an_emotion_directed_at_a_brand_or_product"].map(mapping)


In [14]:
df["emotion"].value_counts()

1    5545
2    2978
0     570
Name: emotion, dtype: int64

In [15]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Apple,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Apple,2
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Apple,2
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Apple,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google,2


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 5 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   product                                             9093 non-null   object
 4   emotion                                             9093 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 355.3+ KB


In [17]:

#use 1 to show all info in cell below then comment it out and use 2 to go back to default view
#1
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#2
#pd.reset_option('display.max_rows')
#pd.reset_option('display.max_colwidth')

In [18]:
#exploring 'No_Product' tweets
filtered_df = df[df['product'] == 'No_Product']

print(filtered_df['tweet_text'])

6                                                                                                                                                                  NaN
51                                    ÛÏ@mention {link} &lt;-- HELP ME FORWARD THIS DOC to all Anonymous accounts, techies,&amp; ppl who can help us JAM #libya #SXSW
52                                                                                     ÷¼ WHAT? ÷_ {link} ã_ #edchat #musedchat #sxsw #sxswi #classical #newTwitter
53                                                                        .@mention @mention on the location-based 'fast, fun and future' - {link} (via @mention #sxsw
65                       Agree. RT @mention Wait. FIONA APPLE is in town??? Somebody kidnap her and put her in a recording studio until she records a new album. #sxsw
66                                                                            At #sxsw? @mention / @mention wanna buy you a drink. 7pm at Fado on 4th. {link} Join us

In [19]:
print(df['tweet_text'][65])
print(df['tweet_text'][3079])
print(df['tweet_text'][4478])
#ill have to manually sort these

Agree. RT @mention Wait. FIONA APPLE is in town??? Somebody kidnap her and put her in a recording studio until she records a new album. #sxsw
Can I tweet this if I only use APPLE :) RT @mention Microsoft's DPE will b SXSWi &amp; Tweeting @mention #microsoft &amp; #sxsw
Wait. FIONA APPLE is in town??? Somebody kidnap her and put her in a recording studio until she records a new album. #sxsw


In [20]:
#not sure about this prob nothing
print(df.iloc[7013])

tweet_text                                            FOUND iPHONE: Ballroom D just now. Volunteer at door has it. #sxsw #sxswi
emotion_in_tweet_is_directed_at                                                                                             NaN
is_there_an_emotion_directed_at_a_brand_or_product                                           No emotion toward brand or product
product                                                                                                              No_Product
emotion                                                                                                                       1
Name: 7013, dtype: object


combine emotion_in_tweet_is_directed_at into two options (apple and google)
drop row 6 - missing tweet
update is_there_an_emotion_directed_at_a_brand_or_product to numeric (0=neg, 1=neutral, 2=pos)
consider adding SWSW to stop words 
look at @, is it dropped in punctuation. do we want it dropped?
look at #?

Additional notes: Sansung and Android included with Google

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 5 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   product                                             9093 non-null   object
 4   emotion                                             9093 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 355.3+ KB


In [22]:
df2 = df[['tweet_text','product','emotion']]

In [23]:
df2_apple = df2[df2['product'] == 'Apple']

In [24]:
df2_apple.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5587 entries, 0 to 9091
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  5587 non-null   object
 1   product     5587 non-null   object
 2   emotion     5587 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 174.6+ KB


In [25]:
df2_apple['emotion'].value_counts()

1    3021
2    2144
0     422
Name: emotion, dtype: int64

In [26]:
df2_google = df2[df2['product'] == 'Google']

In [27]:
df2_google['emotion'].value_counts(normalize=True)

1    0.656081
2    0.290870
0    0.053049
Name: emotion, dtype: float64

Looking at Synthetic Tweets

In [39]:
df_new_tweets = pd.read_excel("data/Tweets_SyntheticGeneration_20240503.xlsx")

In [40]:
df_new_tweets

Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,"1. ""Bought the latest iPhone, and it's already glitching. Apple, you really need to get your act together! #disappointed""","""Bought the latest iPhone, and it's already glitching. Apple, you really need to get your act together! #disappointed"""
1,"1. ""Just got the new iPhone, and it's already crashing apps left and right. Seriously, Apple, get your act together! #frustrated""",
2,"1. ""Just tried out the original iPhone... what a disappointment! The touch screen is so unresponsive, and the camera quality is terrible. Can't believe people actually bought into this hype. #iPhone #fail""",
3,"1. ""Just tried the iPad 2 at #SXSW... disappointed doesn't even begin to describe it. Same old design, same old issues. #iPad2 #letdown""",
4,"1. ""The iPod Nano 3rd Generation was a disappointment. Sure, it had video playback, but the lack of camera and the square design made it awkward to use, and the battery life was poor. #iPodNano3 #disappointing""",
5,"1. ""The Power Mac G4 Cube may look cool, but it's loud, slow, and prone to overheating. Definitely not worth the investment. #PowerMacG4Cube #disappointed""",
6,"1. Just got the latest iPhone, and it's already slower than my old one. Thanks, Apple. #SlowPhone",
7,1. Just had my daily dose of frustration trying to connect my iPhone to my MacBook. Can these devices not communicate or what?,
8,"1. Just when I thought my MacBook couldn't get any slower, it decides to take a coffee break every time I open an app. #MacBookMeltdown",
9,"1. Just when I thought my MacBook keyboard couldn't get any worse, it decides to malfunction on a deadline day. Thanks for the stress, Apple. #KeyboardNightmare",


In [41]:
df_new_tweets.columns = ['tweet_text', 'extra_col']

In [42]:
df_new_tweets_clean = df_new_tweets.drop_duplicates(subset='tweet_text')

In [43]:
df_new_tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1719 entries, 0 to 1767
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  1719 non-null   object
 1   extra_col   1 non-null      object
dtypes: object(2)
memory usage: 40.3+ KB


In [47]:
df_new_tweets_clean = df_new_tweets_clean.drop(labels='extra_col', axis=1)

In [48]:
df_new_tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1719 entries, 0 to 1767
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  1719 non-null   object
dtypes: object(1)
memory usage: 106.9+ KB


In [49]:
df_new_tweets_clean['product'] = 'Apple'

In [50]:
df_new_tweets_clean.head()

Unnamed: 0,tweet_text,product
0,"1. ""Bought the latest iPhone, and it's already glitching. Apple, you really need to get your act together! #disappointed""",Apple
1,"1. ""Just got the new iPhone, and it's already crashing apps left and right. Seriously, Apple, get your act together! #frustrated""",Apple
2,"1. ""Just tried out the original iPhone... what a disappointment! The touch screen is so unresponsive, and the camera quality is terrible. Can't believe people actually bought into this hype. #iPhone #fail""",Apple
3,"1. ""Just tried the iPad 2 at #SXSW... disappointed doesn't even begin to describe it. Same old design, same old issues. #iPad2 #letdown""",Apple
4,"1. ""The iPod Nano 3rd Generation was a disappointment. Sure, it had video playback, but the lack of camera and the square design made it awkward to use, and the battery life was poor. #iPodNano3 #disappointing""",Apple


In [51]:
import re # importing regular expressions

In [52]:
# Define a regular expression pattern to match any number followed by a period and a space
pattern = r'^\d+\.\s'

In [56]:
#Need to remove the numbers in front of the tweets

df_new_tweets_clean['tweet_text'][3]

'1. "Just tried the iPad 2 at #SXSW... disappointed doesn\'t even begin to describe it. Same old design, same old issues. #iPad2 #letdown"'

In [58]:
# Iterate through the dataframe column and remove the matching pattern using regex
df_new_tweets_clean['tweet_text'] = df_new_tweets_clean['tweet_text'].map(lambda x: re.sub(pattern, '', x))

In [59]:
df_new_tweets_clean

Unnamed: 0,tweet_text,product
0,"""Bought the latest iPhone, and it's already glitching. Apple, you really need to get your act together! #disappointed""",Apple
1,"""Just got the new iPhone, and it's already crashing apps left and right. Seriously, Apple, get your act together! #frustrated""",Apple
2,"""Just tried out the original iPhone... what a disappointment! The touch screen is so unresponsive, and the camera quality is terrible. Can't believe people actually bought into this hype. #iPhone #fail""",Apple
3,"""Just tried the iPad 2 at #SXSW... disappointed doesn't even begin to describe it. Same old design, same old issues. #iPad2 #letdown""",Apple
4,"""The iPod Nano 3rd Generation was a disappointment. Sure, it had video playback, but the lack of camera and the square design made it awkward to use, and the battery life was poor. #iPodNano3 #disappointing""",Apple
5,"""The Power Mac G4 Cube may look cool, but it's loud, slow, and prone to overheating. Definitely not worth the investment. #PowerMacG4Cube #disappointed""",Apple
6,"Just got the latest iPhone, and it's already slower than my old one. Thanks, Apple. #SlowPhone",Apple
7,Just had my daily dose of frustration trying to connect my iPhone to my MacBook. Can these devices not communicate or what?,Apple
8,"Just when I thought my MacBook couldn't get any slower, it decides to take a coffee break every time I open an app. #MacBookMeltdown",Apple
9,"Just when I thought my MacBook keyboard couldn't get any worse, it decides to malfunction on a deadline day. Thanks for the stress, Apple. #KeyboardNightmare",Apple


## Need to implement VADER for 'emotion column'