# Section 1: Import Data Sets

In [1]:
# import packages
import json
from collections import Counter
import pandas as pd
import re
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load XboxOne tweets (the file is read as one JSON document per line)
Xboxdata = []
# The context manager closes the file even if reading stops part-way, and the
# explicit encoding avoids depending on the platform default.
with open('data/twitterdataXbox.json', encoding='utf-8') as xbox_file:
    for line in xbox_file:
        try:
            Xboxdata.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or malformed lines only; any other error should surface.
            continue
print(len(Xboxdata))

FileNotFoundError: [Errno 2] No such file or directory: 'data/twitterdataXbox.json'

In [None]:
# Load PS4 tweets (the file is read as one JSON document per line)
PS4data = []
# The context manager closes the file even if reading stops part-way, and the
# explicit encoding avoids depending on the platform default.
with open('data/twitterdataPS4.json', encoding='utf-8') as ps4_file:
    for line in ps4_file:
        try:
            PS4data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or malformed lines only; any other error should surface.
            continue
print(len(PS4data))

# Section 2: Text Processing

In [None]:
# Keep only the Xbox records that carry a 'text' field, then show how many remain
Xboxtweets = [record for record in Xboxdata if 'text' in record]
len(Xboxtweets)

In [None]:
# Keep only the PS4 records that carry a 'text' field, then show how many remain
PS4tweets = [record for record in PS4data if 'text' in record]
len(PS4tweets)

In [None]:
#extract info from Xbox tweets (one list per field, in tweet order)
Xboxids = [T['id_str'] for T in Xboxtweets]
Xboxtimes = [T['created_at'] for T in Xboxtweets]
Xboxtexts = [T['text'] for T in Xboxtweets]
Xboxscreen_names = [T['user']['screen_name'] for T in Xboxtweets]
Xboxfollowers_count = [T['user']['followers_count'] for T in Xboxtweets]
Xboxfriends_count = [T['user']['friends_count'] for T in Xboxtweets]
Xboxnames = [T['user']['name'] for T in Xboxtweets]
# Latitude was missing here although the PS4 cell extracts it; coordinates[0]
# is used for latitude and coordinates[1] for longitude, as in the PS4 cell.
Xboxlats = [(T['geo']['coordinates'][0] if T['geo'] else None) for T in Xboxtweets]
Xboxlons = [(T['geo']['coordinates'][1] if T['geo'] else None) for T in Xboxtweets]
# Tweets without a place get None so the lists stay aligned with Xboxtweets
Xboxplace_names = [(T['place']['full_name'] if T['place'] else None) for T in Xboxtweets]
Xboxplace_types = [(T['place']['place_type'] if T['place'] else None) for T in Xboxtweets]
# Unlike the lists above, this one skips tweets that have no 'user' key
Xboxlocation = [status['user']['location'] for status in Xboxtweets if 'user' in status]

In [None]:
# Pull the fields of interest out of each PS4 tweet (one list per field, in tweet order)
PS4ids = [tweet['id_str'] for tweet in PS4tweets]
PS4times = [tweet['created_at'] for tweet in PS4tweets]
PS4texts = [tweet['text'] for tweet in PS4tweets]

# Author details
PS4screen_names = [tweet['user']['screen_name'] for tweet in PS4tweets]
PS4followers_count = [tweet['user']['followers_count'] for tweet in PS4tweets]
PS4friends_count = [tweet['user']['friends_count'] for tweet in PS4tweets]
PS4names = [tweet['user']['name'] for tweet in PS4tweets]

# Geo and place details; None where the tweet has no geo / place value
PS4lats = [(None if not tweet['geo'] else tweet['geo']['coordinates'][0]) for tweet in PS4tweets]
PS4lons = [(None if not tweet['geo'] else tweet['geo']['coordinates'][1]) for tweet in PS4tweets]
PS4place_names = [(None if not tweet['place'] else tweet['place']['full_name']) for tweet in PS4tweets]
PS4place_types = [(None if not tweet['place'] else tweet['place']['place_type']) for tweet in PS4tweets]

# Unlike the lists above, this one skips tweets that have no 'user' key
PS4location = [tweet['user']['location'] for tweet in PS4tweets if 'user' in tweet]

In [None]:
# Strip URLs: delete every run of non-whitespace characters that starts with "http"
Xboxtexts_no_urls = [re.sub(r"http\S+", "", tweet_text) for tweet_text in Xboxtexts]

Xboxtexts_no_urls[:3]

In [None]:
# Strip URLs: delete every run of non-whitespace characters that starts with "http"
PS4texts_no_urls = [re.sub(r"http\S+", "", tweet_text) for tweet_text in PS4texts]

PS4texts_no_urls[:3]

In [None]:
# Remove Usernames
# The character class includes "_" so a handle such as @user_name is removed
# whole. Without it "_name" stays behind and the later "RT : " removal no
# longer matches the retweet prefix.
Xboxtexts_no_urls_usernames = [
    re.sub(r"@[A-Za-z0-9_]+", "", tweet_text) for tweet_text in Xboxtexts_no_urls
]

Xboxtexts_no_urls_usernames[:3]

In [None]:
# Remove Usernames
# The character class includes "_" so a handle such as @user_name is removed
# whole. Without it "_name" stays behind and the later "RT : " removal no
# longer matches the retweet prefix.
PS4texts_no_urls_usernames = [
    re.sub(r"@[A-Za-z0-9_]+", "", tweet_text) for tweet_text in PS4texts_no_urls
]

PS4texts_no_urls_usernames[:3]

In [None]:
# Drop the "RT : " marker that is left once the retweeted username has been removed
Xboxtexts_no_RT = [re.sub(r"RT : ", "", tweet_text) for tweet_text in Xboxtexts_no_urls_usernames]

Xboxtexts_no_RT[:3]

In [None]:
# Drop the "RT : " marker that is left once the retweeted username has been removed
PS4texts_no_RT = [re.sub(r"RT : ", "", tweet_text) for tweet_text in PS4texts_no_urls_usernames]
PS4texts_no_RT[:3]

In [None]:
# Remove "\n" from text
# Each newline becomes a space rather than nothing, so words on adjacent lines
# are not glued together ("good\ngame" -> "good game", not "goodgame").
# The final clean-up step collapses any repeated whitespace.
Xboxtexts_no_n = [re.sub(r"\n", " ", tweet_text) for tweet_text in Xboxtexts_no_RT]

Xboxtexts_no_n[:3]

In [None]:
#Remove "\n" from text
# Each newline becomes a space rather than nothing, so words on adjacent lines
# are not glued together ("good\ngame" -> "good game", not "goodgame").
# The final clean-up step collapses any repeated whitespace.
PS4texts_no_n = [re.sub(r"\n", " ", tweet_text) for tweet_text in PS4texts_no_RT]

PS4texts_no_n[:3]

In [None]:
# Final clean-up: delete leftover @mentions, every character that is not an
# ASCII letter, digit, space or tab, and scheme://... links; then collapse
# runs of whitespace into single spaces
Xboxtexts_clean_completely = [
    ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", tweet_text).split()
    )
    for tweet_text in Xboxtexts_no_n
]

Xboxtexts_clean_completely[:3]

In [None]:
# Final clean-up: delete leftover @mentions, every character that is not an
# ASCII letter, digit, space or tab, and scheme://... links; then collapse
# runs of whitespace into single spaces
PS4texts_clean_completely = [
    ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", tweet_text).split()
    )
    for tweet_text in PS4texts_no_n
]

PS4texts_clean_completely[:3]

# Section 3: Sentiment Analysis 

## Put data into DataFrame

In [None]:
# One row per cleaned PS4 tweet; the text ends up in the default column 0
dfP = pd.DataFrame(data=PS4texts_clean_completely)

In [None]:
# One row per cleaned XboxOne tweet; the text ends up in the default column 0
dfX = pd.DataFrame(data=Xboxtexts_clean_completely)

## Vader Sentiment

In [None]:
#Define VaderSentiment
# `vs` is the single analyzer instance shared by the scoring cells below.
# (This import repeats the one in the first cell.)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vs = SentimentIntensityAnalyzer()

In [None]:
# Score every XboxOne tweet with VADER and keep only the 'compound' value
xbox_scores = dfX[0].apply(lambda tweet_text: vs.polarity_scores(tweet_text)['compound'])
dfX['score'] = xbox_scores
dfX.head()

In [None]:
# Score every PS4 tweet with VADER and keep only the 'compound' value
ps4_scores = dfP[0].apply(lambda tweet_text: vs.polarity_scores(tweet_text)['compound'])
dfP['score'] = ps4_scores
dfP.head()

In [None]:
# Bucket the Xbox scores by sign: above zero is positive, exactly zero is
# neutral, everything else is negative
xbox_score_values = list(dfX['score'])
positive_reviewXbox = [score for score in xbox_score_values if score > 0]
negative_reviewXbox = [score for score in xbox_score_values if not (score > 0 or score == 0)]
neutral_reviewXbox = [score for score in xbox_score_values if score == 0]

In [None]:
# Print the Xbox counts in the order positive, negative, neutral
for xbox_bucket in (positive_reviewXbox, negative_reviewXbox, neutral_reviewXbox):
    print(len(xbox_bucket))

In [None]:
# Bucket the PS4 scores by sign: above zero is positive, exactly zero is
# neutral, everything else is negative
ps4_score_values = list(dfP['score'])
positive_reviewPS4 = [score for score in ps4_score_values if score > 0]
negative_reviewPS4 = [score for score in ps4_score_values if not (score > 0 or score == 0)]
neutral_reviewPS4 = [score for score in ps4_score_values if score == 0]

In [None]:
# Print the PS4 counts in the order positive, negative, neutral
for ps4_bucket in (positive_reviewPS4, negative_reviewPS4, neutral_reviewPS4):
    print(len(ps4_bucket))

# Section 4: Descriptive Analytics

In [None]:
#Tokenize Xbox data
# Build the corpus from the cleaned tweet text in column 0. str(dfX) is only the
# printed preview of the frame: it mixes in row labels and scores, and pandas
# shortens the preview of a long frame to its first and last rows.
xbox_corpus = ' '.join(dfX[0].astype(str)).lower()
xbox_corpus = re.sub("[^a-zA-Z0-9]", " ", xbox_corpus)
# Look the stopword list up once instead of once per token
english_stopwords = set(stopwords.words('english'))
# A list rather than a one-shot generator, so the tokens can be iterated again
# if the frequency cell is re-run
tokens = [word for word in word_tokenize(xbox_corpus) if word not in english_stopwords]

In [None]:
#Frequency Distribution of Xbox Data
# Counts how often each token produced by the Xbox tokenize cell occurs;
# the bare name on the last line displays the result.
fdist = nltk.FreqDist(tokens)
fdist

In [None]:
#Tokenize PS4 data
# Build the corpus from the cleaned tweet text in column 0. str(dfP) is only the
# printed preview of the frame: it mixes in row labels and scores, and pandas
# shortens the preview of a long frame to its first and last rows.
ps4_corpus = ' '.join(dfP[0].astype(str)).lower()
ps4_corpus = re.sub("[^a-zA-Z0-9]", " ", ps4_corpus)
# Look the stopword list up once instead of once per token
english_stopwords = set(stopwords.words('english'))
# A list rather than a one-shot generator, so the tokens can be iterated again
# if the frequency cell is re-run
tokensp = [word for word in word_tokenize(ps4_corpus) if word not in english_stopwords]

In [None]:
#Frequency Distribution of PS4 Data
# Counts how often each token produced by the PS4 tokenize cell occurs;
# the bare name on the last line displays the result.
fdistp = nltk.FreqDist(tokensp)
fdistp

In [None]:
#Word Cloud PS4 tweets
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords

# Use the text of every cleaned tweet (column 0). str(dfP) is only the printed
# preview of the frame, which mixes in row labels and scores and is shortened
# for long frames.
text = ' '.join(dfP[0].astype(str))

# Kept under its own name so the nltk `stopwords` corpus used by the tokenize
# cells is not replaced by a plain set
cloud_stopwords = set(STOPWORDS)
# The image is used as the mask for the cloud
auto_mask = np.array(Image.open("data/Ps4.jpg"))

wc = WordCloud(background_color="black", max_words=2000, mask=auto_mask,
               stopwords=cloud_stopwords)
# generate word cloud
wc.generate(text)

# show
plt.figure(figsize=(15,15))
plt.imshow(wc)
plt.axis("off")

# Second figure: the mask image itself
plt.figure()
plt.imshow(auto_mask, cmap=plt.cm.gray)
plt.axis("off")
# NOTE(review): the current figure at this point is the mask, so these files
# hold the mask rather than the word cloud, and the Xbox word-cloud cell writes
# to the same two paths -- confirm that this is intended.
plt.savefig("data/Auto.png")
plt.savefig("data/Auto.pdf")

In [None]:
#Word Cloud Xbox tweets
# Relies on the imports made in the PS4 word-cloud cell (np, Image, plt,
# WordCloud, STOPWORDS).
# Use the text of every cleaned tweet (column 0). str(dfX) is only the printed
# preview of the frame, which mixes in row labels and scores and is shortened
# for long frames.
text = ' '.join(dfX[0].astype(str))

# Kept under its own name so the nltk `stopwords` corpus used by the tokenize
# cells is not replaced by a plain set
cloud_stopwords = set(STOPWORDS)
# The image is used as the mask for the cloud
auto_mask = np.array(Image.open("data/Xbox1.jpg"))

wc = WordCloud(background_color="black", max_words=2000, mask=auto_mask,
               stopwords=cloud_stopwords)
# generate word cloud
wc.generate(text)

# show
plt.figure(figsize=(15,15))
plt.imshow(wc)
plt.axis("off")

# Second figure: the mask image itself
plt.figure()
plt.imshow(auto_mask, cmap=plt.cm.gray)
plt.axis("off")
# NOTE(review): the current figure at this point is the mask, so these files
# hold the mask rather than the word cloud, and the PS4 word-cloud cell writes
# to the same two paths -- confirm that this is intended.
plt.savefig("data/Auto.png")
plt.savefig("data/Auto.pdf")

# Section 5: Web Crawling for Data to Answer Questions

In [None]:
# WebCrawl website for total sales of each game system

# import python packages
import requests
from lxml import html
import csv
import pandas as pd

# The timeout stops the notebook hanging on an unresponsive server, and
# raise_for_status() turns an HTTP error response into an immediate failure
# instead of an empty table further down.
r = requests.get('http://www.vgchartz.com/analysis/platform_totals/', timeout=30)
r.raise_for_status()
data = html.fromstring(r.text)

# Xpath
# Each xpath() call returns a list of text nodes; the lists are stored as-is and
# unwrapped by a later cell.
alldata = []

# Only the first 15 table rows are read
for table_row in data.xpath("//tbody/tr")[:15]:
    total_sold = table_row.xpath('td[7]/center/text()')
    North_America = table_row.xpath('td[3]/center/text()')
    platform = table_row.xpath('td[2]/a/text()')
    print(platform, total_sold, North_America)
    alldata.append([platform, total_sold, North_America])

In [None]:
# One row per scraped table row; columns 0, 1, 2 are platform, total sold and North America
dfS = pd.DataFrame(data=alldata)
dfS.head()

In [None]:
# Unwrap the scraped values: keep element 0 of whatever each cell holds
for scraped_column in (0, 1, 2):
    dfS[scraped_column] = dfS[scraped_column].str[0]
dfS.head(5)

In [None]:
# Give the three scraped columns descriptive names
sales_column_names = {
    0: 'Platform',
    1: 'Worldwide sales (in millions)',
    2: 'North America sales (in millions)',
}
dfS = dfS.rename(columns=sales_column_names)
dfS.head(5)

In [None]:
#Locate just XboxOne and PS4 data
# na=False makes rows with a missing platform name count as "no match".
# Without it the mask contains NaN and cannot be used for boolean indexing.
cloud = dfS.Platform.str.contains("PlayStation 4|Xbox One", na=False)
dfS[cloud]

In [None]:
#Create Positive Twitter Reviews column
# Every row starts at 0; a later cell fills in the PS4 and Xbox One counts.
dfS['positive Twitter reviews'] = 0
dfS[cloud]

In [None]:
#Bring number of positive twitter reviews into dataframe
# Rows are selected by platform name rather than by the hard-coded positions
# 5 and 13, which are only right while the scraped table keeps its order.
is_ps4 = dfS['Platform'].str.contains('PlayStation 4', na=False)
is_xbox_one = dfS['Platform'].str.contains('Xbox One', na=False)
dfS.loc[is_ps4, 'positive Twitter reviews'] = len(positive_reviewPS4)
dfS.loc[is_xbox_one, 'positive Twitter reviews'] = len(positive_reviewXbox)
dfS[cloud]

In [None]:
#Create Negative Twitter reviews column
# Every row starts at 0; a later cell fills in the PS4 and Xbox One counts.
dfS['negative Twitter reviews'] = 0
dfS[cloud]

In [None]:
#Bring number of Negative twitter reviews into DataFrame
# Rows are selected by platform name rather than by the hard-coded positions
# 5 and 13, which are only right while the scraped table keeps its order.
is_ps4 = dfS['Platform'].str.contains('PlayStation 4', na=False)
is_xbox_one = dfS['Platform'].str.contains('Xbox One', na=False)
dfS.loc[is_ps4, 'negative Twitter reviews'] = len(negative_reviewPS4)
dfS.loc[is_xbox_one, 'negative Twitter reviews'] = len(negative_reviewXbox)
dfS[cloud]

In [None]:
#Create neutral twitter reviews column
# Every row starts at 0; a later cell fills in the PS4 and Xbox One counts.
dfS['neutral Twitter reviews'] = 0
dfS[cloud]

In [None]:
#Bring number of neutral twitter reviews into DataFrame
# Rows are selected by platform name rather than by the hard-coded positions
# 5 and 13, which are only right while the scraped table keeps its order.
is_ps4 = dfS['Platform'].str.contains('PlayStation 4', na=False)
is_xbox_one = dfS['Platform'].str.contains('Xbox One', na=False)
dfS.loc[is_ps4, 'neutral Twitter reviews'] = len(neutral_reviewPS4)
dfS.loc[is_xbox_one, 'neutral Twitter reviews'] = len(neutral_reviewXbox)
dfS[cloud]

In [None]:
#Webcrawl for Engadget PS4 rating

# The timeout stops the notebook hanging on an unresponsive server, and
# raise_for_status() fails right away on an HTTP error response instead of
# leaving an empty result for the cells below.
rpe = requests.get('https://www.engadget.com/products/sony/playstation/4/', timeout=30)
rpe.raise_for_status()
datape = html.fromstring(rpe.text)

# One [xpath result] entry per matching block on the page
alldatape = []

for rating_block in datape.xpath("//div[@class='table-cell-bottom ta-r']"):
    rating = rating_block.xpath("div[@class='t-list-header-2 th-title']/text()")
    print(rating)
    alldatape.append([rating])

len(alldatape)

In [None]:
#Webcrawl for Engadget XboxOne rating

# The timeout stops the notebook hanging on an unresponsive server, and
# raise_for_status() fails right away on an HTTP error response instead of
# leaving an empty result for the cells below.
rxe = requests.get('https://www.engadget.com/products/microsoft/xbox/one/', timeout=30)
rxe.raise_for_status()
dataxe = html.fromstring(rxe.text)

# One [xpath result] entry per matching block on the page
alldataxe = []

for rating_block in dataxe.xpath("//div[@class='table-cell-bottom ta-r']"):
    rating = rating_block.xpath("div[@class='t-list-header-2 th-title']/text()")
    print(rating)
    alldataxe.append([rating])

len(alldataxe)

In [None]:
# One row per rating block scraped from the Engadget PS4 page (single column 0)
dfpe = pd.DataFrame(data=alldatape)
dfpe.head()

In [None]:
# One row per rating block scraped from the Engadget Xbox One page (single column 0)
dfxe = pd.DataFrame(data=alldataxe)
dfxe.head()

In [None]:
#Remove Brackets
# Keeps element 0 of each cell, i.e. the first text node of the scraped list.
# NOTE(review): running this cell twice would keep only the first character of
# the string left by the first run.
dfpe[0]=dfpe[0].str[0]
dfpe

In [None]:
#Remove Brackets
# Keeps element 0 of each cell, i.e. the first text node of the scraped list.
# NOTE(review): running this cell twice would keep only the first character of
# the string left by the first run.
dfxe[0]=dfxe[0].str[0]
dfxe

In [None]:
#Remove \n from data
# regex=True makes this a substring replacement, so newlines inside a value are removed too
dfpe = dfpe.replace('\n','', regex=True)
dfpe

In [None]:
#Remove \n from data
# regex=True makes this a substring replacement, so newlines inside a value are removed too
dfxe = dfxe.replace('\n','', regex=True)
dfxe

In [None]:
#Drop unwanted rows from PS4 dataframe
# Drops the rows at positions 1, 2 and 3 in place.
# NOTE(review): the positions are hard-coded and the drop mutates dfpe, so this
# cell is not safe to re-run and presumably assumes the page yielded four
# rating blocks -- confirm.
dfpe.drop(dfpe.index[[1,2,3]], inplace=True)
dfpe

In [None]:
#Make second row in Xbox dataframe equal PS4 rating
# Overwrites the second row of dfxe with the first row of dfpe, so dfxe then
# holds the Xbox One value in row 0 and the PS4 value in row 1.
dfxe.iloc[1] = dfpe.iloc[0]
dfxe

In [None]:
#Remove unwanted rows from data
# Drops the rows at positions 2 and 3 in place.
# NOTE(review): the positions are hard-coded and the drop mutates dfxe, so this
# cell is not safe to re-run.
dfxe.drop(dfxe.index[[2,3]], inplace=True)
dfxe

In [None]:
#Rename Column
# Column 0 becomes 'Engadget Rating', the name the main DataFrame cell reads
dfxe = dfxe.rename(columns = {0: 'Engadget Rating'})
dfxe

In [None]:
#Load Engadget Ratings into our Main DataFrame
# Assigning a Series aligns on index labels: only the dfS rows whose labels also
# exist in dfxe receive a value, whatever platform those rows hold. The rest
# become NaN. The next cell moves the values to the PS4 and Xbox One rows.
dfS['Engadget Rating'] = dfxe['Engadget Rating']
dfS.head(3)

In [None]:
#Give PS4 and XboxOne their respective Engadget Ratings
rating_col = 'Engadget Rating'
# dfxe holds the Xbox One rating in its first row and the PS4 rating in its
# second row. Reading from dfxe (not from dfS) keeps this cell re-runnable.
xbox_rating = dfxe[rating_col].iloc[0]
ps4_rating = dfxe[rating_col].iloc[1]
# Rows are selected by platform name rather than by the hard-coded positions
# 5 and 13, which are only right while the scraped table keeps its order.
is_ps4 = dfS['Platform'].str.contains('PlayStation 4', na=False)
is_xbox_one = dfS['Platform'].str.contains('Xbox One', na=False)
# The index-aligned copy from dfxe left the two ratings on whichever platforms
# occupy the first rows of dfS. Clear the column so that only the two consoles
# carry a rating.
dfS[rating_col] = None
dfS.loc[is_ps4, rating_col] = ps4_rating
dfS.loc[is_xbox_one, rating_col] = xbox_rating
dfS[cloud]

In [None]:
#Webcrawl for CNet PS4 rating

# The timeout stops the notebook hanging on an unresponsive server, and
# raise_for_status() fails right away on an HTTP error response instead of
# leaving an empty result for the cells below.
rcp = requests.get('https://www.cnet.com/reviews/sony-playstation-4-review/', timeout=30)
rcp.raise_for_status()
datacp = html.fromstring(rcp.text)

# One [xpath result] entry per matching block on the page
alldatacp = []

for rating_block in datacp.xpath("//div[@class='col-1 overall']"):
    rating = rating_block.xpath('div/span/text()')
    print(rating)
    alldatacp.append([rating])

In [None]:
#Webcrawl for CNet XboxOne rating

# The timeout stops the notebook hanging on an unresponsive server, and
# raise_for_status() fails right away on an HTTP error response instead of
# leaving an empty result for the cells below.
rcx = requests.get('https://www.cnet.com/reviews/microsoft-xbox-one-review/', timeout=30)
rcx.raise_for_status()
datacx = html.fromstring(rcx.text)

# One [xpath result] entry per matching block on the page
alldatacx = []

for rating_block in datacx.xpath("//div[@class='col-1 overall']"):
    rating = rating_block.xpath('div/span/text()')
    print(rating)
    alldatacx.append([rating])

In [None]:
#Create a CNet Rating column in our Main DataFrame
# Every row starts at 0; the next cell fills in the PS4 and Xbox One rows.
dfS['CNet Rating'] = 0
dfS[cloud]

In [None]:
#Use iloc to bring CNet ratings into our main DataFrame
# Positions 5 and 13 are the rows the earlier cells treat as PS4 and Xbox One.
# NOTE(review): these positions are hard-coded and depend on the order of the
# scraped sales table. The values assigned are the whole nested lists built by
# the CNet crawl cells (a list of [xpath result] entries), not scalars -- confirm
# this assignment behaves as intended on the installed pandas version.
dfS.iloc[5, dfS.columns.get_loc('CNet Rating')] = alldatacp
dfS.iloc[13, dfS.columns.get_loc('CNet Rating')] = alldatacx
dfS[cloud]

In [None]:
#Remove brackets from CNet ratings column
# Keeps element 0 of each cell's value.
# NOTE(review): rows still holding the integer 0 placeholder presumably become
# NaN here, and re-running the cell unwraps one more level -- confirm.
dfS['CNet Rating']=dfS['CNet Rating'].str[0]
dfS[cloud]