# Project_3 Xbox One vs PS4

# Problem Statement

Use Reddit's API to collect posts from the Xbox One and PS4 subreddits, then use NLP to train a classifier that predicts which subreddit a post came from.

Microsoft can use the classifier to efficiently filter posts from other social media platforms for further analysis.



## Executive Summary:

### 1. Webscraping

### 2. Import libraries and load data

### 3. Data cleaning & Exploratory Data Analysis

### 4. Modeling
### 4.1 Logistic Regression Model
### 4.2 Naive Bayes model
### 4.3 Logistic Regression Model with Pipeline
### 4.4 MultinomialNB Model with Pipeline

### 5. Conclusion and Recommendations

### 1. Webscraping

In [1]:
#import library for webscraping
import requests
import time
import pandas as pd

In [2]:
#url for webscraping
xboxone_url = 'https://www.reddit.com/r/xboxone.json'
ps4_url = 'https://www.reddit.com/r/PS4.json'

In [3]:
#Define header
headers = {'user-agent': 'Chope 0.1'}

In [4]:
#Make a GET request to url
res_xboxone = requests.get(xboxone_url, headers=headers)

In [5]:
#Get status code
res_xboxone.status_code

200

In [6]:
#Assign json
the_json_xboxone = res_xboxone.json()

In [7]:
#list the top-level keys of the JSON response in sorted order
sorted(the_json_xboxone.keys())

['data', 'kind']

In [8]:
#list the keys found under 'data' in sorted order
sorted(the_json_xboxone['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [9]:
#inspect the post at index 1 (the second entry in 'children') to see which fields a post carries
the_json_xboxone['data']['children'][1]['data']

{'approved_at_utc': None,
 'subreddit': 'xboxone',
 'selftext': '',
 'author_fullname': 't2_fqykj',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'E3 2020 Officially Canceled',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/xboxone',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': None,
 'downs': 0,
 'thumbnail_height': 78,
 'hide_score': False,
 'name': 't3_fgyxrc',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'author_flair_background_color': None,
 'subreddit_type': 'public',
 'ups': 8411,
 'total_awards_received': 1,
 'media_embed': {},
 'thumbnail_width': 140,
 'author_flair_template_id': None,
 'is_original_content': False,
 'user_reports': [],
 'secure_media': None,
 'is_reddit_media_domain': False,
 'is_meta': False,
 'category': None,
 'secure_media_embed': {},
 'link_flair_text': None,
 'can_mod_post': False,
 'score': 8411,
 'approved_by': None,
 'author_premium': False,
 'thumbnail': 'https://b.thumbs.re

In [10]:
#get the listing's 'after' value (here it equals the name of the last post on this page);
#the scraping loop passes it as the 'after' request parameter
the_json_xboxone['data']['after']

't3_fgs2xx'

In [11]:
#collect the 'name' field of every post on this page
[post['data']['name'] for post in the_json_xboxone['data']['children']]

['t3_fgxqjj',
 't3_fgyxrc',
 't3_fgzcd6',
 't3_fh7l6c',
 't3_fh712y',
 't3_fh4ujs',
 't3_fh90gy',
 't3_fgyih6',
 't3_fgvkfl',
 't3_fh9tmw',
 't3_fh1zg2',
 't3_fgz23s',
 't3_fh9khe',
 't3_fh7x5y',
 't3_fgypsf',
 't3_fh2oal',
 't3_fha0mg',
 't3_fhahkw',
 't3_fh3tyb',
 't3_fgwq3s',
 't3_fh4qyj',
 't3_fh1sg6',
 't3_fgz6c7',
 't3_fgcgpr',
 't3_fgxs5t',
 't3_fgs2xx']

In [12]:
#set param
param = {'after': 't3_fffrwt'}

In [None]:
#send a GET request with the 'after' parameter set, to try out paginated access
requests.get(xboxone_url, params = param, headers=headers)

In [None]:
#establish webscrape loop
#fetch up to 40 pages of r/xboxone, following the 'after' value returned by each page,
#and accumulate the raw post entries in xboxone_posts
xboxone_posts = []
after = None
url = 'https://www.reddit.com/r/xboxone.json'
for i in range(40):
    print(i)
    #the first request carries no 'after'; later requests continue from the previous page
    params = {} if after is None else {'after': after}
    #timeout so a stalled connection raises instead of hanging the notebook
    res = requests.get(url, params=params, headers=headers, timeout=30)
    if res.status_code != 200:
        print(res.status_code)
        break
    the_json_xboxone = res.json()
    xboxone_posts.extend(the_json_xboxone['data']['children'])
    after = the_json_xboxone['data']['after']
    if after is None:
        #no further page: stop here, otherwise the next request would start
        #again from the first page and collect duplicate posts
        break
    #pause between requests to avoid hammering the server
    time.sleep(1)

In [None]:
#count how many distinct post names were collected
len({post['data']['name'] for post in xboxone_posts})

In [None]:
#assign dataframe
xboxone = pd.DataFrame(xboxone_posts)

In [None]:
#save as csv file
pd.DataFrame(xboxone_posts).to_csv('xboxone.csv')

In [None]:
#check dataframe
xboxone

In [None]:
#Make a GET request to url
res_ps4 = requests.get(ps4_url, headers=headers)

In [None]:
#Get status code
res_ps4.status_code

In [None]:
#Assign json
the_json_ps4 = res_ps4.json()

In [None]:
#list the top-level keys of the JSON response in sorted order
sorted(the_json_ps4.keys())

In [None]:
#list the keys found under 'data' in sorted order
sorted(the_json_ps4['data'].keys())

In [None]:
#inspect the post at index 1 (the second entry in 'children') to see which fields a post carries
the_json_ps4['data']['children'][1]['data']

In [None]:
#get the listing's 'after' value; the scraping loop passes it as the 'after' request parameter
the_json_ps4['data']['after']

In [None]:
#collect the 'name' field of every post on this page
[post['data']['name'] for post in the_json_ps4['data']['children']]

In [None]:
#set param
#'after' takes a post's full name, which carries the 't3_' prefix like every
#other post name in this notebook; the bare id 'ffhs9o' is not a full name
param = {'after': 't3_ffhs9o'}

In [None]:
#send a GET request with the 'after' parameter set, to try out paginated access
requests.get(ps4_url, params = param, headers=headers)

In [None]:
#establish webscrape loop
#fetch up to 40 pages of r/PS4, following the 'after' value returned by each page,
#and accumulate the raw post entries in posts_ps4
posts_ps4 = []
after = None
url = 'https://www.reddit.com/r/PS4.json'
for i in range(40):
    print(i)
    #the first request carries no 'after'; later requests continue from the previous page
    params = {} if after is None else {'after': after}
    #timeout so a stalled connection raises instead of hanging the notebook
    res = requests.get(url, params=params, headers=headers, timeout=30)
    if res.status_code != 200:
        print(res.status_code)
        break
    the_json = res.json()
    posts_ps4.extend(the_json['data']['children'])
    after = the_json['data']['after']
    if after is None:
        #no further page: stop here, otherwise the next request would start
        #again from the first page and collect duplicate posts
        break
    #pause between requests to avoid hammering the server
    time.sleep(1)

In [None]:
#count how many distinct post names were collected
len({post['data']['name'] for post in posts_ps4})

In [None]:
#assign dataframe
ps4 = pd.DataFrame(posts_ps4)

In [None]:
#save as csv file
pd.DataFrame(posts_ps4).to_csv('ps4.csv')

In [None]:
#check dataframe
#display the frame itself, as done for xboxone; `ps4.describe` without
#parentheses only evaluates to the bound method instead of calling it
ps4