# BUSINESS UNDERSTANDING

# DATA UNDERSTANDING

In [381]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize


In [382]:
# Loading the dataset
data = pd.read_csv('judge-1377884607_tweet_product_company.csv', encoding = 'latin1')
data

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [383]:
# Checking the dataset's head
data.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [384]:
# Checking the dataset's tail
data.tail(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9083,"Google says the future is all around you! (ie,...",,No emotion toward brand or product
9084,"Google says the future is location, location, ...",,No emotion toward brand or product
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion
9086,Google says: want to give a lightning talk to ...,,No emotion toward brand or product
9087,"@mention Yup, but I don't have a third app yet...",,No emotion toward brand or product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [385]:
# Checking the shape
data.shape

(9093, 3)

In [386]:
# Description
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,9092,3291,9093
unique,9065,9,4
top,RT @mention Marissa Mayer: Google Will Connect...,iPad,No emotion toward brand or product
freq,5,946,5389


In [387]:
# Checking for duplicates
data.duplicated().sum()

22

In [388]:
# Checking for missing values
data.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

# DATA PREPARATION

General steps:
- Rename columns
- Remove missing value from tweet_text
- Handle the duplicated data
- Remove Capitalization
- Remove Punctuation
- Remove stop words
- Use stemming or lemmatization
- Fill emotion_in_tweet_is_directed_at with their appropriate values

In [389]:
# Rename the columns
data.rename(columns={"tweet_text":"tweet","emotion_in_tweet_is_directed_at":"brand",
                    "is_there_an_emotion_directed_at_a_brand_or_product":"sentiment"},inplace=True)
data.head()

Unnamed: 0,tweet,brand,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [390]:
# Drop missing row in tweet column 
data.dropna(subset=["tweet"],inplace=True,axis=0)
data.isna().sum()

tweet           0
brand        5801
sentiment       0
dtype: int64

In [391]:
# Drop the duplicated data
data.drop_duplicates(inplace=True)
# Check if the duplicates have been dropped
data.duplicated().sum()

0

In [392]:
# Remove Capitalization in tweets
data["tweet"] = data["tweet"].str.lower()
# Preview data
data.head()

Unnamed: 0,tweet,brand,sentiment
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,iPhone,Negative emotion
1,@jessedee know about @fludapp ? awesome ipad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin can not wait for #ipad 2 also. the...,iPad,Positive emotion
3,@sxsw i hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on fri #sxsw: marissa m...,Google,Positive emotion


In [393]:
# Remove user names from tweets
def user_name(df):
    """
    This function removes the user names and hyperlinks from tweets column
    """
    text = []
    for words in df.tweet:
        text.append(words)
    texts = []
    for x in range(len(text)):
        users_removed = " ".join(word for word in text[x].split() if not word.startswith(("@",".","http:","{link}")))
        texts.append(users_removed)
    return texts

In [394]:
# Place the data without user names into a dataframe
clean_data = pd.DataFrame(user_name(data),columns=["tweet"])
clean_data["brand"] = data["brand"]
clean_data["sentiment"] = data["sentiment"]
clean_data.head()

Unnamed: 0,tweet,brand,sentiment
0,i have a 3g iphone. after 3 hrs tweeting at #r...,iPhone,Negative emotion
1,know about ? awesome ipad/iphone app that you'...,iPad or iPhone App,Positive emotion
2,can not wait for #ipad 2 also. they should sal...,iPad,Positive emotion
3,i hope this year's festival isn't as crashy as...,iPad or iPhone App,Negative emotion
4,great stuff on fri #sxsw: marissa mayer (googl...,Google,Positive emotion


In [395]:
# Remove  punctuation
clean_data['tweet'] = clean_data.tweet.str.replace('[^\w\s]','')
clean_data.head()

Unnamed: 0,tweet,brand,sentiment
0,i have a 3g iphone after 3 hrs tweeting at ris...,iPhone,Negative emotion
1,know about awesome ipadiphone app that youll ...,iPad or iPhone App,Positive emotion
2,can not wait for ipad 2 also they should sale ...,iPad,Positive emotion
3,i hope this years festival isnt as crashy as t...,iPad or iPhone App,Negative emotion
4,great stuff on fri sxsw marissa mayer google t...,Google,Positive emotion


In [396]:
# Remove stop words and punctuation
stopwords = stopwords.words('english')
stopwords += list(string.punctuation)
clean_data['tweet'] = clean_data.tweet.apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))


In [397]:
# Remove Special characters
clean_data['tweet'] = clean_data.tweet.str.replace('#','')
clean_data['tweet'] = clean_data.tweet.str.replace('@','')

In [398]:
# Priview the data
clean_data.head(10)

Unnamed: 0,tweet,brand,sentiment
0,3g iphone 3 hrs tweeting rise_austin dead need...,iPhone,Negative emotion
1,know awesome ipadiphone app youll likely appre...,iPad or iPhone App,Positive emotion
2,wait ipad 2 also sale sxsw,iPad,Positive emotion
3,hope years festival isnt crashy years iphone a...,iPad or iPhone App,Negative emotion
4,great stuff fri sxsw marissa mayer google tim ...,Google,Positive emotion
5,new ipad apps speechtherapy communication show...,,No emotion toward brand or product
6,sxsw starting ctia around corner googleio hop ...,,
7,beautifully smart simple idea rt wrote hollerg...,Android,Positive emotion
8,counting days sxsw plus strong canadian dollar...,iPad or iPhone App,Positive emotion
9,excited meet sxsw show sprint galaxy still run...,Apple,Positive emotion


In [399]:
# Change the brand names to either Apple or Google
brands = {
    "iPhone":"Apple",
    "iPad": "Apple",
    "Android": "Google",
    "Android App": "Google",
    "iPad or iPhone App": "Apple",
    "Other Google product or service": "Google",
    "Other Apple product or service": "Apple"
}
clean_data.replace({"brand":brands},inplace=True)

clean_data.head()

Unnamed: 0,tweet,brand,sentiment
0,3g iphone 3 hrs tweeting rise_austin dead need...,Apple,Negative emotion
1,know awesome ipadiphone app youll likely appre...,Apple,Positive emotion
2,wait ipad 2 also sale sxsw,Apple,Positive emotion
3,hope years festival isnt crashy years iphone a...,Apple,Negative emotion
4,great stuff fri sxsw marissa mayer google tim ...,Google,Positive emotion


In [401]:
clean_data.brand.value_counts()

Apple     2397
Google     877
Name: brand, dtype: int64

In [425]:
# Fill the column in brand with their respective values
dict = {
    "iphone": "Apple",
    "android": "Google",
    "google": "Google",
    "ipad": "Apple",
    "ipad/iphone": "Apple",
    "apple": "Apple",
    "itunes": "Apple"
}

for key,value in dict.items():
    clean_data["brand"][clean_data.tweet.str.contains(key)] = value
clean_data.brand.value_counts()

Apple     5626
Google    2988
Name: brand, dtype: int64

In [430]:
# Check null values
clean_data.brand.isna().sum()

# Replace null values with Others
clean_data.brand.fillna("Others",inplace=True)


# FEATURE ENGINEERING

# MODELING

# EVALUATION

# DEPLOYMENT