# Script to Extract Reddit Posts

In [5]:
#Pip Install
!pip install asyncpraw

Collecting asyncpraw
  Downloading asyncpraw-7.8.0-py3-none-any.whl.metadata (9.0 kB)
Collecting aiofiles (from asyncpraw)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting aiosqlite<=0.17.0 (from asyncpraw)
  Downloading aiosqlite-0.17.0-py3-none-any.whl.metadata (4.1 kB)
Collecting asyncprawcore<3,>=2.1 (from asyncpraw)
  Downloading asyncprawcore-2.4.0-py3-none-any.whl.metadata (5.5 kB)
Collecting update_checker>=0.18 (from asyncpraw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading asyncpraw-7.8.0-py3-none-any.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading aiosqlite-0.17.0-py3-none-any.whl (15 kB)
Downloading asyncprawcore-2.4.0-py3-none-any.whl (19 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiosqlite, aiofiles, update

In [6]:
!pip install --upgrade openai



In [7]:
#Import Necessary Libraries
import asyncpraw
import pandas as pd
import datetime as dt
import requests
from textblob import TextBlob
import nest_asyncio
import asyncio
import re
import spacy
import openai
from pydantic import BaseModel
from typing import Literal

In [8]:
#make sure openai version is at least 1.55
print(openai.__version__)

1.58.1


In [None]:
#build instance
reddit_read_only = asyncpraw.Reddit(client_id="CLIENT_ID",
                               client_secret="CLIENT_SECRET",
                               user_agent="USER_AGENT")

In [None]:
#make sure it can run in any environment
nest_asyncio.apply()

#get top n reddit posts
async def subreddit_top_n_posts(subreddit_name, n=150, time_frame="month"):
    subreddit = await reddit_read_only.subreddit(subreddit_name)
    posts = subreddit.top(time_frame, limit=n)

    posts_dict = {
        "Title": [],
        "Post Text": [],
        "ID": [],
        "Score": [],
        "Total Comments": [],
        "Post URL": []
    }

    count = 0
    async for post in posts:
        posts_dict["Title"].append(post.title)
        posts_dict["Post Text"].append(post.selftext)
        posts_dict["ID"].append(post.id)
        posts_dict["Score"].append(post.score)
        posts_dict["Total Comments"].append(post.num_comments)
        posts_dict["Post URL"].append(post.url)

        count += 1
        if count >= n:
            break

    top_posts = pd.DataFrame(posts_dict)
    return top_posts

#extract comments from a post link
async def get_comments_from_post(post_url):
    submission = await reddit_read_only.submission(url=post_url)
    await submission.load()

    post_comments = []
    for comment in submission.comments:
        if isinstance(comment, asyncpraw.models.MoreComments):
            continue
        post_comments.append(comment.body)

    comment_df = pd.DataFrame({'comment': post_comments,
                               'Post ID': submission.id})
    return comment_df

#get the comments from all posts in a list of post links
async def comments_from_all_posts(post_url_list):
    comments_df = pd.DataFrame()
    non_functional_links = []

    for post_url in post_url_list:
        try:
            post_df = await get_comments_from_post(post_url)
            comments_df = pd.concat([comments_df, post_df], ignore_index=True)
        except Exception as e:
            #links from which comments cannot be extracted
            non_functional_links.append(post_url)

    return comments_df, non_functional_links

#asynchronous execution function
async def main(subreddit_name, n=150, time_frame="month"):
    subreddit_name = subreddit_name
    top_posts = await subreddit_top_n_posts(subreddit_name, n, time_frame)
    post_urls = top_posts["Post URL"].tolist()
    comments_df, non_functional_links = await comments_from_all_posts(
        post_urls)

    return top_posts, comments_df, non_functional_links

In [None]:
#Preprocessing For Sentiment Analysis

#tokenize words
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize(text):
    return word_tokenize(text)

#remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word.lower() not in stop_words]

#lemmatize words
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

#string back together words
def one_string(text):
    return ' '.join(text)

#preprocessing function
def preprocess_text(text):
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    #tokens = lemmatize(tokens)
    return one_string(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
#Sentiment Analysis
def sentiment_analysis_textblob(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

def sentiment_scorer(score):
    if score > 0.2:
        return 'positive'
    elif score < -0.2:
        return 'negative'
    else:
        return 'neutral'

def do_sentiment_analysis(text):
    score = sentiment_analysis_textblob(text)
    sentiment = sentiment_scorer(score)
    return sentiment

"def final_df_creator(top_posts, comments):\n    final_df = merge_posts_comments(top_posts, comments)\n    final_df['sentiment'] = final_df['comment'].apply(sentiment_analysis_textblob)\n    final_df['sentiment'] = final_df['sentiment'].apply(sentiment_scorer)\n    final_df['sentiment_post'] = final_df['Post Text'].apply(sentiment_analysis_textblob)\n    final_df['sentiment_post'] = final_df['sentiment_post'].apply(sentiment_scorer)\n\n    return final_df"

In [None]:
#load the NER model
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'lemmatizer'])

def extract_place_names(text):
    doc = nlp(text)
    places = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC', 'FAC', 'ORG']]
    return places

In [None]:
def processing(top_posts,comments):
    #preprocess for each text column
    top_posts['Title_SentPre'] = top_posts['Title'].apply(preprocess_text)
    top_posts['Post_SentPre'] = top_posts['Post Text'].apply(preprocess_text)
    comments['comment_SentPre'] = comments['comment'].apply(preprocess_text)

    #sentiment analysis for each text column
    top_posts['Title_sentiment'] = top_posts['Title_SentPre'].apply(do_sentiment_analysis)
    top_posts['Post_sentiment'] = top_posts['Post_SentPre'].apply(do_sentiment_analysis)
    comments['comment_sentiment'] = comments['comment_SentPre'].apply(do_sentiment_analysis)

    #filter out only positive rows
    top_posts = top_posts[(top_posts['Title_sentiment'] == 'positive') | (top_posts['Post_sentiment'] == 'positive')]
    comments = comments[comments['comment_sentiment'] == 'positive']

    #extract placenames
    places_titles = top_posts['Title'].apply(extract_place_names)
    places_posts = top_posts['Post Text'].apply(extract_place_names)
    places_comments = comments['comment'].apply(extract_place_names)

    #extract all items to be single items
    places_titles = places_titles.explode().reset_index(drop=True)
    places_posts = places_posts.explode().reset_index(drop=True)
    places_comments = places_comments.explode().reset_index(drop=True)

    #merge the three lists
    places  = pd.concat([places_titles, places_posts, places_comments]).reset_index(drop=True)
    places = set(places)
    places = {x for x in places if x==x and x is not None}
    return places



In [None]:
def contains_emoticon(s):
    emoticon_pattern = re.compile(r'[:;=X][\-~]?[\)D\(\]/\\OpP]')
    return bool(emoticon_pattern.search(s))

def remove_emoticon(lst):
    return [x for x in lst if not contains_emoticon(x)]

In [None]:
#import csv file from local machine for nyc areas
from google.colab import files
uploaded = files.upload()


Saving nyc-neighbourhood-data.geojson to nyc-neighbourhood-data.geojson


In [None]:
#extract data from .geojson file into a dataframe
import json

#extract all NYC neighborhoods and boroughs from file
with open('nyc-neighbourhood-data.geojson') as f:
    data = json.load(f)

neighborhoods = []
boroughs = []
for feature in data['features']:
    properties = feature['properties']
    neighborhoods.append(properties['neighborhood'])
    boroughs.append(properties['borough'])

nyc_areas = set(neighborhoods + boroughs)
nyc_areas = {x.lower() for x in nyc_areas}

#extract "neighborhood" and "borough" from data
def remove_nyc_areas(lst):
    return [x for x in lst if x.lower() not in nyc_areas]

In [None]:
#make sure it can run in any environment
nest_asyncio.apply()

async def places_from_subreddit(subreddit, n=150, time_frame="month"):
  top_posts, comments, non_functional_links = await main(subreddit, n, time_frame)
  places = processing(top_posts,comments)
  return places

In [None]:
def places_no_nonsense(places):
    places = remove_nyc_areas(places)
    places = remove_emoticon(places)
    return places

In [None]:
def final_places(subreddit):
  places = places_from_subreddit(subreddit)
  places = places_no_nonsense(places)
  return places

In [None]:
async def final_places_multiple_subreddits(subreddit_list, n=150, time_frame="month"):
  final_places = []
  subreddit_number = 0
  for subreddit in subreddit_list:
    places = await places_from_subreddit(subreddit, n, time_frame)
    places = places_no_nonsense(places)
    final_places.append(places)
    subreddit_number += 1
    print(f"{subreddit_number} out of {len(subreddit_list)} subreddits done")
  final_places = pd.Series(final_places).explode().reset_index(drop=True)
  return set(final_places)

In [None]:
subreddits = ['FoodNYC','AskNYC','bronx','Queens','Brooklyn','manhattan','statenisland','nyc']

final_places = asyncio.run(final_places_multiple_subreddits(subreddits, 300, "year"))

1 out of 8 subreddits done
2 out of 8 subreddits done
3 out of 8 subreddits done
4 out of 8 subreddits done
5 out of 8 subreddits done
6 out of 8 subreddits done




7 out of 8 subreddits done
8 out of 8 subreddits done


In [1]:
print(len(final_places))

NameError: name 'final_places' is not defined

In [None]:
print(final_places.iloc[:,1])

Unnamed: 0,0
0,L'Industrie
1,Probably
2,Vito's
3,HBO
4,23th Street
...,...
4331,"Queens with Sal, Kris"
4332,LA Fitness
4333,Minnesotan
4334,VA


In [None]:
#save progress
pd.DataFrame(final_places).to_csv('final_places.csv')

In [None]:
#use chatgpt api to filter list - fill code
#filter out
#chain restaurants
#places that are not places
#places in New York


In [None]:
final_places = final_places.iloc[:,1].to_list()

In [None]:
print(len(list(final_places.iloc[:,1])))

4336

In [None]:
api_key = 'openai_api_key'
client = openai.OpenAI(api_key=api_key)

In [None]:
def is_location_in_city(place_list,city):
  class Places(BaseModel):
    name: str
    real: bool
    #in_city: bool
    district: bool
    street: bool
    type: Literal['Athletics', 'Bar', 'Café', 'Cultural Immersion', 'Event', 'Museum', 'Nature', 'Restaurant']

  class PlacesResponse(BaseModel):
    choices: list[Places]

  completion = client.beta.chat.completions.parse(
      model="gpt-4o-2024-08-06",
      max_completion_tokens = 16000,
      messages=[
          {"role": "system", "content": ("You are a data analyst, who scraped data from Reddit to find the best places to be in a city.\n"
          "You take every place instance and you verify with online sources if it is a place that actually exists.\n"
          "Keep in mind that you haver a list of names and the following is your task"
          "You answer four questions\n"
          "1. Is this a place that actually exists or nonsense?\n"
          #"2. If it’s real, is it located in the given city?\n"
          "2. Is the place just a street name or a district within the city it id in (e.g. 23rd Stree or DeKalb avenue in NYC)"
          "For number 2, please do not mark district == True if the place is a park or a monument (e.g. Battery Park in NYC)"
          "3. Is the place a street name?"
          "4. What category of place is it?"
                                         )},
          {"role": "user", "content": f"Are the following {place_list} in {city}"}
      ],
      response_format=PlacesResponse,
  )

  filtered_places = completion.choices[0].message.parsed
  return filtered_places

In [None]:
def batch_location_filtering(places_list,city, batch_size=100):
  n = batch_size
  #divide list into m lists of length n
  all_filtered_places = []
  places_list_list = [places_list[j:j+n] for j in range(0, len(places_list), n)]
  count = 0
  for i in places_list_list:
    count += 1
    filtered_places = is_location_in_city(i,city)
    print(f'Round #{count} out of {len([[places_list_list]])} is done.')
    print(filtered_places.choices)
    all_filtered_places.append(filtered_places.choices)
  #all_filtered_places = [item for sublist in all_filtered_places for item in sublist]
  return all_filtered_places

In [None]:
filtered_places = batch_location_filtering(list(final_places.iloc[:,1]), 'New York City')

Round #1 out of 1 is done.
[Places(name="L'Industrie", real=True, district=False, street=False, type='Restaurant'), Places(name='Probably', real=False, district=False, street=False, type='Cultural Immersion'), Places(name="Vito's", real=True, district=False, street=False, type='Restaurant'), Places(name='HBO', real=True, district=False, street=False, type='Cultural Immersion'), Places(name='23th Street', real=True, district=False, street=True, type='Cultural Immersion'), Places(name='North India', real=False, district=False, street=False, type='Cultural Immersion'), Places(name='Brooklyn Promenade', real=True, district=False, street=False, type='Nature'), Places(name='Cross Island', real=True, district=False, street=False, type='Cultural Immersion'), Places(name='NYC Reservation Difficulties', real=False, district=False, street=False, type='Cultural Immersion'), Places(name='Botanical Gardens', real=True, district=False, street=False, type='Nature'), Places(name='Arthur Ave Recommendat

In [None]:
#check length of filtered list
print(len([i.name for i in filtered_places_explode if (i.real == True) & (i.district == False) & (i.street == False)]))

1567

In [None]:
city_places = [i for i in filtered_places_explode if (i.real == True) & (i.district == False) & (i.street == False)]

In [None]:
#make city_places a dataframe
city_places = pd.DataFrame(city_places)

In [None]:
city_places.columns = ['name', 'real', 'district', 'street', 'category']
for c in city_places.columns:
  city_places[c] = city_places[c].apply(lambda x: x[1])
city_places.head()

Unnamed: 0,name,real,district,street,category
0,L'Industrie,True,False,False,Restaurant
1,Vito's,True,False,False,Restaurant
2,HBO,True,False,False,Cultural Immersion
3,Brooklyn Promenade,True,False,False,Nature
4,Cross Island,True,False,False,Cultural Immersion


In [None]:
#remove duplicates based on name
city_places = city_places.drop_duplicates(subset=['name'])

In [None]:
#sum city_places by categroy
city_places.groupby('category').count()

Unnamed: 0_level_0,name,real,district,street
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Athletics,20,20,20,20
Bar,85,85,85,85
Café,69,69,69,69
Cultural Immersion,579,579,579,579
Event,33,33,33,33
Museum,62,62,62,62
Nature,176,176,176,176
Restaurant,539,539,539,539


In [None]:
#Filter Out Fast Food Restaurants
fast_food_restaurants = [
    "McDonald’s",
    "Mickey D’s",
    "McD",
    "McDonalds",
    "Golden Arches",
    "Burger King",
    "BK",
    "King of Burgers",
    "Subway",
    "Subway Sandwiches",
    "Eat Fresh",
    "Taco Bell",
    "Taco Hell",
    "T-Bell",
    "Wendy’s",
    "Wendys",
    "Redhead",
    "KFC",
    "K-Fried",
    "Kentucky Chicken",
    "Colonel’s Chicken",
    "Chick-fil-A",
    "Chickfila",
    "Chick Fil A",
    "The Chicken Sandwich Place",
    "Domino’s Pizza",
    "Domino's",
    "Dominos Pizza",
    "Domino’s",
    "Pizza Hut",
    "The Hut",
    "PizzaHut",
    "Starbucks",
    "Bucks",
    "SBUX",
    "Starbucks Coffee",
    "Dunkin’",
    "Dunkin",
    "Dunkies",
    "DD",
    "Popeyes",
    "Popeyes Louisiana Kitchen",
    "Popeye’s Chicken",
    "Popeyes Chicken and Biscuits",
    "Sonic Drive-In",
    "Sonic",
    "Sonic Burger",
    "Arby’s",
    "Arbys",
    "Roast Beef Place",
    "Five Guys",
    "Five Guys Burgers and Fries",
    "FiveGuys",
    "In-N-Out Burger",
    "In-N-Out",
    "In and Out",
    "In & Out",
    "Jack in the Box",
    "Jack’s",
    "JITB",
    "Carl’s Jr.",
    "Hardee’s",
    "Carls Jr",
    "Little Caesars",
    "Little Caesar's",
    "Pizza! Pizza!",
    "Chipotle Mexican Grill",
    "Chipotle",
    "Chip"
]

city_places = city_places[~city_places['name'].isin(fast_food_restaurants)]

In [None]:
#export data
city_places.iloc[:,[0,4]].to_csv('city_places.csv')

## [To Be Implemented Later] NLP for better sentiment analysis to replace textblob

In [None]:
#tokenize words
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize(text):
    return word_tokenize(text)

#remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word.lower() not in stop_words]

#lemmatize words
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

#string back together words
def one_string(text):
    return ' '.join(text)

#cut off end of string if more than 510 tokens
def cut_string(text):
    if len(text) > 510:
        return text[:510]
    else:
        return text

#preprocessing function
def preprocess_text(text):
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    tokens = one_string(tokens)
    return = cut_string(tokens)


In [None]:
final_df['comment'] = final_df['comment'].apply(tokenize)
final_df['comment'] = final_df['comment'].apply(remove_stopwords)
final_df['comment'] = final_df['comment'].apply(lemmatize)
final_df['comment'] = final_df['comment'].apply(one_string)
final_df['comment'] = final_df['comment'].apply(cut_string)

final_df['Post Text'] = final_df['Post Text'].apply(tokenize)
final_df['Post Text'] = final_df['Post Text'].apply(remove_stopwords)
final_df['Post Text'] = final_df['Post Text'].apply(lemmatize)
final_df['Post Text'] = final_df['Post Text'].apply(one_string)
final_df['Post Text'] = final_df['Post Text'].apply(cut_string)

In [None]:
from transformers import pipeline
from joblib import Parallel, delayed

# Initialize the Hugging Face pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# Function to process sentiment analysis in batches
def analyze_sentiment_batch(texts):
    # Use batch processing with the Hugging Face pipeline
    results = sentiment_pipeline(texts, batch_size=8)
    return [result['label'] for result in results]

# Function to split data into chunks
def chunkify(data, chunk_size):
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# Parallelized processing function
def process_sentiment_parallel(data_column, chunk_size=100):
    # Split the data into chunks
    chunks = chunkify(data_column.tolist(), chunk_size)

    # Process chunks in parallel
    results = Parallel(n_jobs=-1)(
        delayed(analyze_sentiment_batch)(chunk) for chunk in chunks
    )

    # Flatten results from all chunks
    return [item for sublist in results for item in sublist]

# Example DataFrame
data = {
    "comment": [
        "I love this product!",
        "This is the worst experience ever.",
        "Python is amazing.",
        "I feel neutral about this.",
        "Absolutely terrible service!",
    ],
    "Post Text": [
        "The new update is fantastic.",
        "Why is the service so bad?",
        "Great features in the latest release.",
        "This is okay but could be better.",
        "Awful, I won't use this again."
    ]
}
#final_df = pd.DataFrame(data)

# Process sentiment for 'comment' and 'Post Text' columns
final_df['sentiment'] = process_sentiment_parallel(final_df['comment'], chunk_size=100)
final_df['post_sentiment'] = process_sentiment_parallel(final_df['Post Text'].unique(), chunk_size=100)

final_df.head(20)


RuntimeError: The size of tensor a (578) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
x=final_df.loc[:,"comment"]

In [None]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis",
                              model="distilbert-base-uncased-finetuned-sst-2-english")
def get_sentiment(text):
    return sentiment_pipeline(text,batch_size=8)[0]['label']

y = [get_sentiment(i) for i in x]

In [None]:
[type(i) for i in final_df['comment'] if not isinstance(i,str)]

[]

In [None]:
set([type(i) for i in final_df['comment']])

{str}

In [None]:
#Sentiment Analysis
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis",
                              model="distilbert-base-uncased-finetuned-sst-2-english")
def get_sentiment(text):
    return sentiment_pipeline(text,batch_size=8)[0]['label']

from joblib import Parallel, delayed

#results_sentiment = Parallel(n_jobs=-1)(delayed(analyze_sentiment)(text) for text in data)
final_df['sentiment'] = Parallel(n_jobs=-1)(delayed(get_sentiment)(text) for text in final_df['comment'])
final_df['post_sentiment'] = Parallel(n_jobs=-1)(delayed(get_sentiment)(text) for text in final_df['Post Text'].unique())
"""def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity"""

#final_df['sentiment'] = final_df['comment'].apply(get_sentiment)
#final_df['post_sentiment'] = final_df['Post Text'].apply(get_sentiment)

KeyboardInterrupt: 

In [None]:
final_df

Unnamed: 0,Title,Post Text,ID,Comment_Number,comment,sentiment,post_sentiment
0,Yelp “Critic” writes a negative review of Ikig...,I’m impressed by the professional response fro...,1gp90f0,comment 1,I love how the owner had RECEIPTS. Such a deta...,0.053636,0.3375
1,Yelp “Critic” writes a negative review of Ikig...,I’m impressed by the professional response fro...,1gp90f0,comment 2,"This is the kind of energy we need. Proof, rec...",1.000000,0.3375
2,Yelp “Critic” writes a negative review of Ikig...,I’m impressed by the professional response fro...,1gp90f0,comment 3,Damn for the final course he cooked and served...,0.000000,0.3375
3,Yelp “Critic” writes a negative review of Ikig...,I’m impressed by the professional response fro...,1gp90f0,comment 4,"Just to add some additional context, Ikigai op...",0.170000,0.3375
4,Yelp “Critic” writes a negative review of Ikig...,I’m impressed by the professional response fro...,1gp90f0,comment 5,Thanks! That review ruined my whole day. It's...,0.296667,0.3375
...,...,...,...,...,...,...,...
3245,Looking for the best bars with weird/unique/go...,Looking for cocktail bars that have unique men...,1gu9rx7,comment 15,Apothoke chinatown or flatiron,0.000000,0.3750
3246,Looking for the best bars with weird/unique/go...,Looking for cocktail bars that have unique men...,1gu9rx7,comment 16,Chinato,0.000000,0.3750
3247,Looking for the best bars with weird/unique/go...,Looking for cocktail bars that have unique men...,1gu9rx7,comment 17,- Apotheke Chinatown\n- Mace\n- the hidden pea...,-0.166667,0.3750
3248,Hot chocolate,Hey so grew up here and make it my yearly miss...,1gsm9u5,comment 1,YES. The cookies are amazing as well.,0.600000,0.3625


In [None]:
non_functional_links

['https://i.redd.it/4oiy91x13swd1.jpeg',
 'https://i.redd.it/w5s5p1lbb5yd1.jpeg',
 'https://i.redd.it/zsebu9qhde0e1.jpeg',
 'https://i.redd.it/f81mx1zbdfyd1.jpeg',
 'https://i.redd.it/yeudwlpuha2e1.jpeg',
 'https://i.redd.it/r19oejljw52e1.jpeg',
 'https://www.nytimes.com/article/best-nyc-pizza-pizzeria.html',
 'https://v.redd.it/wt8o5mlvw21e1',
 'https://www.theinfatuation.com/new-york/guides/best-new-dishes-nyc-2024?utm_campaign=reddit&utm_medium=affiliate&utm_source=reddit',
 'https://ny.eater.com/2024/10/30/24282656/l-l-hawaiian-barbecue-opening-new-york-manhattan',
 'https://i.redd.it/ggliy32kvozd1.jpeg',
 'https://i.redd.it/xziu7khcqpzd1.jpeg',
 'https://i.redd.it/y9x06u1t1b1e1.jpeg',
 'https://i.redd.it/96bbku1dlhzd1.jpeg',
 'https://i.redd.it/gkad7vel4d2e1.jpeg',
 'https://i.redd.it/24cz6vmru21e1.jpeg',
 'https://i.redd.it/vsxkss51va2e1.jpeg',
 'https://i.redd.it/fzrt00ku7jzd1.jpeg']

In [None]:
comments_df

Unnamed: 0,comment,Post ID
0,I love how the owner had RECEIPTS. Such a deta...,1gp90f0
1,"This is the kind of energy we need. Proof, rec...",1gp90f0
2,Damn for the final course he cooked and served...,1gp90f0
3,"Just to add some additional context, Ikigai op...",1gp90f0
4,Thanks! That review ruined my whole day. It's...,1gp90f0
...,...,...
1458,IMO the best ramen in NYC is at Ramen Ishida o...,1go1kni
1459,Jun-Men Ramen Bar,1go1kni
1460,Ichiran is the only answer. That’s the one.,1go1kni
1461,Lucky cat on 53rd between 2 and 3. It’s not th...,1go1kni


In [None]:
def subreddit_top_n_posts(subreddit_name,n=1000,time_frame="month"):
    posts = reddit_read_only.subreddit(subreddit_name).top(time_frame, limit=n)
    posts_dict = {"Title": [], "Post Text": [],
              "ID": [], "Score": [],
              "Total Comments": [], "Post URL": []
              }
    for post in posts:
      posts_dict["Title"].append(post.title)
      posts_dict["Post Text"].append(post.selftext)
      posts_dict["ID"].append(post.id)
      posts_dict["Score"].append(post.score)
      posts_dict["Total Comments"].append(post.num_comments)
      posts_dict["Post URL"].append(post.url)

    top_posts = pd.DataFrame(posts_dict)
    return top_posts

import asyncio

async def get_comments_from_post(post_url):
    submission = await reddit_read_only.submission(url=post_url) # Await the coroutine
    post_comments = []
    for comment in submission.comments:
        if type(comment) == asyncpraw.models.MoreComments:
            continue
        post_comments.append(comment.body)
    comment_df = pd.DataFrame({'comment': post_comments, 'Post ID': submission.id})
    return comment_df

# When calling the async function
async def main():  # Define an async main function
    comments_df = await get_comments_from_post(top_posts.iloc[0, -1])
    print(comments_df)


"""def get_comments_from_post(post_url):
    submission = reddit_read_only.submission(url=post_url)
    post_comments = []
    for comment in submission.comments:
        if type(comment) == asyncpraw.models.MoreComments:
            continue
        post_comments.append(comment.body)
    comment_df = pd.DataFrame({'comment': post_comments, 'Post ID': submission.id})
    return comment_df"""

"""def get_comments_from_post(post_url):
    submission = reddit_read_only.submission(url=post_url)
    post_comments = []

    # Ensure comments are fully loaded
    submission.comments.replace_more(limit=0)

    for comment in submission.comments:
        post_comments.append(comment.body)

    # Create a DataFrame with comments
    comment_df = pd.DataFrame({'comment': post_comments, 'Post ID': submission.id})
    return comment_df"""

"""def comments_from_all_post(post_url_list):
    comments_df = pd.DataFrame()
    non_functional_links = []
    for post_url in post_url_list:
        try:
            print(get_comments_from_post(post_url).head)
            comments_df = pd.concat([comments_df, get_comments_from_post(post_url)])
        except:
            non_functional_links.append(post_url)
            continue
    return comments_df, non_functional_links"""
def comments_from_all_posts(post_url_list):
    comments_df = pd.DataFrame()
    non_functional_links = []

    for post_url in post_url_list:
        try:
            # Fetch and concatenate comments
            post_df = get_comments_from_post(post_url)
            comments_df = pd.concat([comments_df, post_df], ignore_index=True)
        except Exception as e:
            # Log errors for debugging
            print(f"Error with URL {post_url}: {e}")
            non_functional_links.append(post_url)
            continue

    return comments_df, non_functional_links

def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity



#function to store all text in one table
#def store_all_text(df):


In [None]:
import nest_asyncio
nest_asyncio.apply()

# Run the async function using asyncio.run()
asyncio.run(main())

                                              comment  Post ID
0   I love how the owner had RECEIPTS. Such a deta...  1gp90f0
1   This is the kind of energy we need. Proof, rec...  1gp90f0
2   Damn for the final course he cooked and served...  1gp90f0
3   Just to add some additional context, Ikigai op...  1gp90f0
4   Thanks!  That review ruined my whole day. It's...  1gp90f0
..                                                ...      ...
89  And did Yelp take it down?    Yelp is awful st...  1gp90f0
90  As soon as I saw the word "resy" I knew this w...  1gp90f0
91  Great professional response laying out all the...  1gp90f0
92  How do you even know what the owner is saying ...  1gp90f0
93                                          [removed]  1gp90f0

[94 rows x 2 columns]


In [None]:
get_comments_from_post(top_posts.iloc[0,-1])

AttributeError: 'coroutine' object has no attribute 'comments'

In [None]:
comments_df, non_functional_links = comments_from_all_post(top_posts.iloc[:,-1])
comments_df

  continue


In [None]:
comments_df

In [None]:
#get posts from a subreddit
sr_name = "FoodNYC"

posts = subreddit_top_n_posts(sr_name)

#get

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Unnamed: 0,comment,Post ID
0,I love how the owner had RECEIPTS. Such a deta...,1gp90f0
1,"This is the kind of energy we need. Proof, rec...",1gp90f0
2,Damn for the final course he cooked and served...,1gp90f0
3,"Just to add some additional context, Ikigai op...",1gp90f0
4,Thanks! That review ruined my whole day. It's...,1gp90f0
...,...,...
89,And did Yelp take it down? Yelp is awful st...,1gp90f0
90,"As soon as I saw the word ""resy"" I knew this w...",1gp90f0
91,Great professional response laying out all the...,1gp90f0
92,How do you even know what the owner is saying ...,1gp90f0


In [None]:
subreddit = reddit_read_only.subreddit("FoodNYC")

# Display the name of the Subreddit
print("Display Name:", subreddit.display_name)

# Display the title of the Subreddit
print("Title:", subreddit.title)

#extract posts
for post in subreddit.top(limit=5):
    print(post.title)
    print()

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Display Name: FoodNYC


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Title: FoodNYC
What I ate as a tourist in your beautiful city

Yelp “Critic” writes a negative review of Ikigai and the business responds with their narrative based on video footage

Quick trip to NYC. Mostly ate with friends who live there. 16 years since my last visit.

PSA that Saigon Vietnamese Sandwich Deli in little Italy (who arguably have the best banh mi in the city) are struggling to make rent post pandemic

Unpopular Opinion: We Cut Restaurants Way Too Much Slack



In [None]:
posts = subreddit.top("month")
# Scraping the top posts of the current month

posts_dict = {"Title": [], "Post Text": [],
              "ID": [], "Score": [],
              "Total Comments": [], "Post URL": []
              }

for post in posts:
    # Title of each post
    posts_dict["Title"].append(post.title)

    # Text inside a post
    posts_dict["Post Text"].append(post.selftext)

    # Unique ID of each post
    posts_dict["ID"].append(post.id)

    # The score of a post
    posts_dict["Score"].append(post.score)

    # Total number of comments inside the post
    posts_dict["Total Comments"].append(post.num_comments)

    # URL of each post
    posts_dict["Post URL"].append(post.url)

# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts

Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month")
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL
0,Yelp “Critic” writes a negative review of Ikig...,I’m impressed by the professional response fro...,1gp90f0,3128,283,https://www.reddit.com/gallery/1gp90f0
1,The only booth at the holiday markets that I n...,,1gs5h5k,790,138,https://www.reddit.com/gallery/1gs5h5k
2,Finally Did It—Opened My First Burger Joint 🍔 ...,"Hey, FoodNYC—after lurking on this page for fa...",1gbky3f,759,122,https://www.reddit.com/r/FoodNYC/comments/1gbk...
3,Looks like they are opening a Wegman’s across ...,,1gbev3m,440,73,https://i.redd.it/4oiy91x13swd1.jpeg
4,Funny story about a 2-star Michelin restaurant...,I work at a Trader Joe's that's close to a 2-s...,1grnilq,404,149,https://www.reddit.com/r/FoodNYC/comments/1grn...
...,...,...,...,...,...,...
95,Atomix - Price Increase for December,Well another month with no success in scoring ...,1ghdvn7,29,33,https://www.reddit.com/r/FoodNYC/comments/1ghd...
96,Good “Classic NYC” Restaurants for Out of Town...,Have some friends from Europe coming tomorrow ...,1gve9a9,29,52,https://www.reddit.com/r/FoodNYC/comments/1gve...
97,I reviewed Sarge's Deli after our recent trip ...,This week's Saboscrivner review is for a fanta...,1gngdc9,25,12,https://www.reddit.com/r/FoodNYC/comments/1gng...
98,Best Hot Chocolate in each borough?,"In anticipation of cooler weather, where are t...",1gp3juv,26,20,https://www.reddit.com/r/FoodNYC/comments/1gp3...


In [None]:
url = top_posts.iloc[0,-1]

# Creating a submission object
submission = reddit_read_only.submission(url=url)


post_comments = []

for comment in submission.comments:
    if type(comment) == MoreComments:
        continue

    post_comments.append(comment.body)

# creating a dataframe
comments_df = pd.DataFrame(post_comments, columns=['comment'])
comments_df

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Unnamed: 0,comment
0,I love how the owner had RECEIPTS. Such a deta...
1,"This is the kind of energy we need. Proof, rec..."
2,Damn for the final course he cooked and served...
3,"Just to add some additional context, Ikigai op..."
4,Thanks! That review ruined my whole day. It's...
...,...
89,And did Yelp take it down? Yelp is awful st...
90,"As soon as I saw the word ""resy"" I knew this w..."
91,Great professional response laying out all the...
92,How do you even know what the owner is saying ...


In [None]:
submission.id

'1gp90f0'

In [None]:
POST /api/search_subreddits

In [None]:
#sentiment analysis of comments_df
from textblob import TextBlob

def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

comments_df['sentiment'] = comments_df['comment'].apply(get_sentiment)

In [None]:
comments_df

Unnamed: 0,comment,sentiment
0,I love how the owner had RECEIPTS. Such a deta...,0.053636
1,"This is the kind of energy we need. Proof, rec...",1.000000
2,Damn for the final course he cooked and served...,0.000000
3,"Just to add some additional context, Ikigai op...",0.170000
4,Thanks! That review ruined my whole day. It's...,0.296667
...,...,...
89,And did Yelp take it down? Yelp is awful st...,-0.577778
90,"As soon as I saw the word ""resy"" I knew this w...",0.000000
91,Great professional response laying out all the...,0.150000
92,How do you even know what the owner is saying ...,-0.188095
