# **Project #3: Reddit NLP - Data Scraping Raidsecrets
### DestinyTheGame vs. raidsecrets
*By Daniel Preston McBride*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime
import time

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from bs4 import BeautifulSoup

---
### Accessing reddit data through pushshift API for Raidsecrets subreddit

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [3]:
params = {
    'subreddit': 'raidsecrets',
    'size': 100
}

In [4]:
res = requests.get(url, params)

In [5]:
res.status_code

200

In [6]:
data = res.json()

In [7]:
posts = data['data']

In [8]:
raid_reddit = pd.DataFrame(posts)

In [9]:
raid_reddit = raid_reddit[['subreddit','title','selftext','created_utc']]

In [10]:
raid_reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    100 non-null    object
 1   title        100 non-null    object
 2   selftext     100 non-null    object
 3   created_utc  100 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 3.2+ KB


In [11]:
raid_reddit.head()

Unnamed: 0,subreddit,title,selftext,created_utc
0,raidsecrets,Bunker E15 Warmind cheese?,So i was running Legendary lost sector bunker ...,1615990366
1,raidsecrets,I think the New Strike will be the longest yet.,So we know several requirements from the strik...,1615989805
2,raidsecrets,New Frostbite glitch + jump section on Hunter ...,\*Language Warning\*\n\n[https://vimeo.com/man...,1615980660
3,raidsecrets,Conqueror hidden triumph for gild?,Anyone know what it is? Can't find any informa...,1615967416
4,raidsecrets,Full 2 Man GoS Possible ?,Recently with the introduction of stasis cryst...,1615952955


In [12]:
raid_reddit.shape[0]

100

---
### Created while loop to pull 100 posts every 5 seconds from the subreddit.  Accessed next older posts by utilizing the `before` hyperparameter on the `created_utc` timestamp column.

In [13]:
while raid_reddit.shape[0] < 2000:
    time.sleep(5)

    params = {
        'subreddit': 'raidsecrets',
        'size': 100,
        'before': raid_reddit['created_utc'].values[-1]
    }

    res = requests.get(url, params)
    data = res.json()
    posts = data['data']
    df = pd.DataFrame(posts)
    df = df[['subreddit','title','selftext','created_utc']]
    raid_reddit = pd.concat([raid_reddit, df])

In [14]:
raid_reddit.shape

(2000, 4)

---
### Checked for duplicate rows

In [15]:
raid_reddit[raid_reddit.duplicated()]

Unnamed: 0,subreddit,title,selftext,created_utc


---
### Created new column with `created_utc` column converted to datetime to better understand the date/time difference between each post

> *Referenced converting utc to datetime from: https://www.kite.com/python/answers/how-to-convert-epoch-time-to-datetime-in-python*

In [16]:
raid_reddit['utc_to_datetime'] = [datetime.datetime.fromtimestamp(utc) for utc in raid_reddit['created_utc']]
raid_reddit['utc_to_datetime']

0    2021-03-17 10:12:46
1    2021-03-17 10:03:25
2    2021-03-17 07:31:00
3    2021-03-17 03:50:16
4    2021-03-16 23:49:15
             ...        
95   2020-12-08 16:12:35
96   2020-12-08 16:09:28
97   2020-12-08 16:08:28
98   2020-12-08 16:07:46
99   2020-12-08 16:06:11
Name: utc_to_datetime, Length: 2000, dtype: datetime64[ns]

In [17]:
raid_reddit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 99
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subreddit        2000 non-null   object        
 1   title            2000 non-null   object        
 2   selftext         1953 non-null   object        
 3   created_utc      2000 non-null   int64         
 4   utc_to_datetime  2000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 93.8+ KB


In [18]:
raid_reddit.head(10)

Unnamed: 0,subreddit,title,selftext,created_utc,utc_to_datetime
0,raidsecrets,Bunker E15 Warmind cheese?,So i was running Legendary lost sector bunker ...,1615990366,2021-03-17 10:12:46
1,raidsecrets,I think the New Strike will be the longest yet.,So we know several requirements from the strik...,1615989805,2021-03-17 10:03:25
2,raidsecrets,New Frostbite glitch + jump section on Hunter ...,\*Language Warning\*\n\n[https://vimeo.com/man...,1615980660,2021-03-17 07:31:00
3,raidsecrets,Conqueror hidden triumph for gild?,Anyone know what it is? Can't find any informa...,1615967416,2021-03-17 03:50:16
4,raidsecrets,Full 2 Man GoS Possible ?,Recently with the introduction of stasis cryst...,1615952955,2021-03-16 23:49:15
5,raidsecrets,Did I Find the Winnower's Blade?,If you don't want to read my thesis I have thi...,1615945143,2021-03-16 21:39:03
6,raidsecrets,Presage voices triggering further inside the s...,"Over the weeks, the first voiceline (the one w...",1615939686,2021-03-16 20:08:06
7,raidsecrets,Fast Seasonal Challenges Week 6,[removed],1615934769,2021-03-16 18:46:09
8,raidsecrets,Secret symbol near Future War Cult room or jus...,Hey guys. I was walking around in the tower be...,1615932731,2021-03-16 18:12:11
9,raidsecrets,New Infinite Frostbite After Patch Glitch,[https://youtu.be/1mJbYOAUIwE](https://youtu.b...,1615925671,2021-03-16 16:14:31


In [19]:
raid_reddit.to_csv('../data/raid_reddit.csv', index=False)