# **Project #3: Reddit NLP - Data Scraping DestinyTheGame
### DestinyTheGame vs. raidsecrets
*By Daniel Preston McBride*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime
import time

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from bs4 import BeautifulSoup

---
### Accessing reddit data through pushshift API for DestinyTheGame subreddit

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [3]:
params = {
    'subreddit': 'destinythegame',
    'size': 100
}

In [4]:
res = requests.get(url, params)

In [5]:
res.status_code

200

In [6]:
data = res.json()

In [7]:
posts = data['data']

In [8]:
dest_reddit = pd.DataFrame(posts)

In [9]:
dest_reddit = dest_reddit[['subreddit','title','selftext','created_utc']]

In [10]:
dest_reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    100 non-null    object
 1   title        100 non-null    object
 2   selftext     100 non-null    object
 3   created_utc  100 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 3.2+ KB


In [11]:
dest_reddit.head()

Unnamed: 0,subreddit,title,selftext,created_utc
0,DestinyTheGame,The infinite frostbite glitch in DSC has been ...,I hope you a[l](https://www.youtube.com/watch?...,1615996647
1,DestinyTheGame,Beyond Light Deluxe Edition weapon ornament fo...,My ornament wont unlock even though I have the...,1615996597
2,DestinyTheGame,Add the Class-Specific Swords as Rewards for G...,[removed],1615996546
3,DestinyTheGame,The Collector's Edition Sword is Still Sunset.,"Title. This is a problem, because it's a sword...",1615996218
4,DestinyTheGame,The Darkness or Savathûn might be influencing ...,After seeing the arguement at the Helm this re...,1615996158


In [12]:
dest_reddit.shape[0]

100

---
### Created while loop to pull 100 posts every 5 seconds from the subreddit.  Accessed next older posts by utilizing the `before` hyperparameter on the `created_utc` timestamp column.

In [13]:
while dest_reddit.shape[0] < 2000:
    time.sleep(5)

    params = {
        'subreddit': 'destinythegame',
        'size': 100,
        'before': dest_reddit['created_utc'].values[-1]
    }

    res = requests.get(url, params)
    data = res.json()
    posts = data['data']
    df = pd.DataFrame(posts)
    df = df[['subreddit','title','selftext','created_utc']]
    dest_reddit = pd.concat([dest_reddit, df])

In [14]:
dest_reddit.shape

(2000, 4)

---
### Checked for duplicate rows

In [15]:
dest_reddit[dest_reddit.duplicated()]

Unnamed: 0,subreddit,title,selftext,created_utc


---
### Created new column with `created_utc` column converted to datetime to better understand the date/time difference between each post

> *Referenced converting utc to datetime from: https://www.kite.com/python/answers/how-to-convert-epoch-time-to-datetime-in-python*

In [16]:
dest_reddit['utc_to_datetime'] = [datetime.datetime.fromtimestamp(utc) for utc in dest_reddit['created_utc']]
dest_reddit['utc_to_datetime']

0    2021-03-17 11:57:27
1    2021-03-17 11:56:37
2    2021-03-17 11:55:46
3    2021-03-17 11:50:18
4    2021-03-17 11:49:18
             ...        
95   2021-03-13 21:05:04
96   2021-03-13 20:58:13
97   2021-03-13 20:57:40
98   2021-03-13 20:56:18
99   2021-03-13 20:55:06
Name: utc_to_datetime, Length: 2000, dtype: datetime64[ns]

In [17]:
dest_reddit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 99
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subreddit        2000 non-null   object        
 1   title            2000 non-null   object        
 2   selftext         1999 non-null   object        
 3   created_utc      2000 non-null   int64         
 4   utc_to_datetime  2000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 93.8+ KB


In [18]:
dest_reddit.head(10)

Unnamed: 0,subreddit,title,selftext,created_utc,utc_to_datetime
0,DestinyTheGame,The infinite frostbite glitch in DSC has been ...,I hope you a[l](https://www.youtube.com/watch?...,1615996647,2021-03-17 11:57:27
1,DestinyTheGame,Beyond Light Deluxe Edition weapon ornament fo...,My ornament wont unlock even though I have the...,1615996597,2021-03-17 11:56:37
2,DestinyTheGame,Add the Class-Specific Swords as Rewards for G...,[removed],1615996546,2021-03-17 11:55:46
3,DestinyTheGame,The Collector's Edition Sword is Still Sunset.,"Title. This is a problem, because it's a sword...",1615996218,2021-03-17 11:50:18
4,DestinyTheGame,The Darkness or Savathûn might be influencing ...,After seeing the arguement at the Helm this re...,1615996158,2021-03-17 11:49:18
5,DestinyTheGame,What would you rank hand cannons for the crucible,"My hand cannon ranking\n\nA. Steady hand, true...",1615996004,2021-03-17 11:46:44
6,DestinyTheGame,Constant Weasel,Since last week I have been getting weasel err...,1615995891,2021-03-17 11:44:51
7,DestinyTheGame,This game feels impenetrable for me as a new p...,After slowly making my way through a huge back...,1615995855,2021-03-17 11:44:15
8,DestinyTheGame,GM's being gated at 1325 power is pointless an...,Before we start lets get some things out of th...,1615995817,2021-03-17 11:43:37
9,DestinyTheGame,Aphelion's Rest ascendant challenge still no a...,title,1615995359,2021-03-17 11:35:59


In [20]:
dest_reddit.to_csv('../data/dest_reddit.csv', index=False)