In [None]:
# standard library
import json
import os
import re  # for cleaning text
import sys
import time
import warnings
from collections import Counter  # text analysis

# third-party
import matplotlib.pyplot as plt  # visualizations
import nltk  # for stopwords and tokenizing text
import numpy as np
import pandas as pd
import praw  # reddit scraper
import seaborn as sns  # visualizations
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob  # text analysis

warnings.filterwarnings('ignore')

In [None]:
# Initialize Reddit client.
# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook; the original placeholder strings remain as
# fallbacks, so editing them in place keeps working.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "my id"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "my secret"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "my user agent"),
)

# Choose subreddits
subreddits = [
    "london", "CasualUK", "ukcrime",
    "AskUK", "ukpolitics", "BritishProblems",
    "LegalAdviceUK", "policeuk", "ProtectAndServe",
    "UKHousing", "UKLegalAdvice"
]

# keywords to search for
queries = [
    "burglary", "burgled", "break-in", "breaking and entering",
    "home invasion", "house broken into", "theft", "stolen", "robbery", "robbed",
    "property crime", "residential theft", "home security", "door locks", "window security",
    "CCTV", "burglar alarm", "neighborhood watch",
    # British spelling: most of the subreddits above are UK communities, so the
    # US-only spelling would miss the way their users actually write it.
    "neighbourhood watch",
]

In [None]:
# stores all posts (a post matched by several queries is appended once per match)
results = []
count = 0
for count, query in enumerate(queries, start=1):
    for sub in subreddits:
        try:
            # Searches for max 500 posts for each sub-query combination
            for post in reddit.subreddit(sub).search(query, limit=500, sort="new", time_filter="all"):
                results.append({
                    "title": post.title,
                    "selftext": post.selftext,
                    "author": str(post.author),
                    "score": post.score,
                    "url": post.url,
                    "created_utc": post.created_utc,
                    "num_comments": post.num_comments,
                    "id": post.id,
                    "permalink": post.permalink,
                    "subreddit": str(post.subreddit),
                    "matched_query": query
                })
        except Exception as exc:
            # Errors surface while iterating the search listing. Without this
            # guard a single failing subreddit/query combination would abort the
            # whole run before anything is written to disk. Posts already
            # collected for this combination are kept.
            # Exception (not a bare except) is used so KeyboardInterrupt still stops the run.
            print(f"Skipping r/{sub} for query '{query}': {exc!r}")
        # for reddit timeout
        time.sleep(1)
    print(f"Queries left: {len(queries) - count}", end='\r')
    sys.stdout.flush()

# store results in 'london_crime_posts.json'
with open("london_crime_posts.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Saved {len(results)} posts to london_crime_posts.json")

In [None]:
# Extra filler words (apostrophe-less contractions plus frequent but
# uninformative terms) that are dropped in addition to NLTK's English list.
more_stop_words = [
    'im', 'ive', 'dont', 'know', 'new', 'want', 'got', 'day', 'said', 'really',
    'like', 'time', 'london', 'uk', 'years', 'think', 'going', 'need', 'year',
    'didnt', 'good', 'feel'
]
# Single set holding both sources, used for membership tests during cleaning.
stop_words = set(stopwords.words('english')) | set(more_stop_words)

# clean the text such that there are only relevant words left
def clean_text(text):
    """Normalise a post into a space-separated string of lowercase words.

    Steps: lowercase, remove URLs, remove everything that is not a letter,
    tokenize with NLTK and drop every token found in ``stop_words``.

    Args:
        text: Raw post text. Any non-string value (e.g. NaN) yields "".

    Returns:
        The remaining tokens joined by single spaces (possibly "").
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    # Apostrophes are deleted (not spaced) so contractions stay one token:
    # "don't" -> "dont", which is the form the custom stop-word list contains.
    text = re.sub(r"['’]", "", text)
    # Every other non-letter becomes a space. Deleting it outright would fuse
    # words that are separated only by punctuation ("burglary/theft" ->
    # "burglarytheft", "locked.Then" -> "lockedthen").
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

In [None]:
# Load the scraped posts; a post matched by several queries appears several
# times in the file, so only the first row per Reddit id is kept.
posts = (
    pd.read_json("london_crime_posts.json")
    .drop_duplicates(subset="id", keep="first")
    .reset_index(drop=True)
)
# Title and body are analysed together: join them, then clean the result.
title_and_body = posts['title'].fillna('') + " " + posts['selftext'].fillna('')
posts['full_text'] = title_and_body.apply(clean_text)

In [None]:
# sentiment analysis
def analyze_sentiment_dimensions(text):
    """Score a text on overall sentiment plus keyword-based trust/effectiveness.

    Args:
        text: The text to score (lower-cased internally).

    Returns:
        dict with four entries:
            'overall_sentiment': TextBlob polarity of the text.
            'subjectivity': TextBlob subjectivity of the text.
            'trust_net': trust keyword hits minus distrust keyword hits.
            'effectiveness_net': effective keyword hits minus ineffective hits.
    """
    lowered = text.lower()

    # overall sentiment
    sentiment = TextBlob(lowered).sentiment

    def count_mentions(keywords):
        # Each keyword must start at a word boundary. Plain substring counting
        # found 'trust' inside 'distrust', 'effective' inside 'ineffective',
        # 'reliable' inside 'unreliable' and 'honest' inside 'dishonest', so a
        # negative word was counted on both sides and cancelled itself out.
        # Suffixes are still allowed ('trusted', 'corruption', 'wasted').
        return sum(len(re.findall(r"\b" + re.escape(word), lowered)) for word in keywords)

    # trust related sentiment
    trust_keywords = ['trust', 'reliable', 'dependable', 'honest', 'transparent']
    distrust_keywords = ['distrust', 'unreliable', 'corrupt', 'biased', 'unfair']
    trust_score = count_mentions(trust_keywords)
    distrust_score = count_mentions(distrust_keywords)

    # effectiveness sentiment
    effective_keywords = ['effective', 'helpful', 'successful', 'works', 'solved']
    ineffective_keywords = ['ineffective', 'useless', 'failed', 'waste', 'pointless']
    effective_score = count_mentions(effective_keywords)
    ineffective_score = count_mentions(ineffective_keywords)

    return {
        'overall_sentiment': sentiment.polarity,
        'subjectivity': sentiment.subjectivity,
        'trust_net': trust_score - distrust_score,
        'effectiveness_net': effective_score - ineffective_score
    }

# apply and add to posts DF
# Sentiment is scored on the original wording rather than on `full_text`:
# cleaning removes stop words, which include negations such as "not"/"no" and
# the custom entry 'good', so "not good" would reach the analyzer as an empty
# string and be scored as neutral.
sentiment_input = posts['title'].fillna('') + " " + posts['selftext'].fillna('')
sentiment_results = sentiment_input.apply(analyze_sentiment_dimensions)
# Build the frame on posts' own index so the column-wise concat lines up row for row.
sentiment_df = pd.DataFrame(sentiment_results.tolist(), index=posts.index)
# Drop columns left behind by an earlier run of this cell; otherwise re-running
# it would append duplicate 'overall_sentiment', 'trust_net', ... columns.
posts = pd.concat(
    [posts.drop(columns=sentiment_df.columns, errors='ignore'), sentiment_df],
    axis=1
)

In [None]:
# predictive policing attitude classification
def classify_predictive_policing_attitude(text):
    """Label a text by which group of policing-related keywords dominates it.

    Args:
        text: The text to classify (lower-cased internally).

    Returns:
        'pro_tech' if pro-technology terms strictly outnumber both other
        groups, 'anti_tech' if anti-technology terms strictly outnumber both
        other groups, otherwise 'community_focused' when at least one
        community term occurs, else 'neutral'.
    """
    text = text.lower()

    # pro-technology policing indicators
    # ('evidence-based' is listed in all three spellings so it is still found
    # after hyphens have been stripped or replaced by spaces)
    pro_tech = [
        'data', 'algorithm', 'predict', 'prevent', 'technology', 'smart',
        'efficient', 'evidence-based', 'evidence based', 'evidencebased',
        'statistics', 'analysis', 'pattern'
    ]

    # anti-technology policing indicators
    anti_tech = [
        'surveillance', 'privacy', 'bias', 'discrimination', 'profiling',
        'freedom', 'rights', 'civil liberties'
    ]

    # community-focused policing indicators (both US and UK spellings)
    community = [
        'community', 'local', 'neighborhood', 'neighbourhood', 'residents',
        'engagement', 'dialogue', 'relationship', 'trust', 'human'
    ]

    def count_mentions(keywords):
        # Keywords must start at a word boundary: substring counting treated
        # 'distrust' as a 'trust' hit and 'unbiased' as a 'bias' hit.
        # Suffixes are still allowed ('prediction', 'prevented', 'biased').
        return sum(len(re.findall(r"\b" + re.escape(word), text)) for word in keywords)

    # get all scores
    pro_score = count_mentions(pro_tech)
    anti_score = count_mentions(anti_tech)
    community_score = count_mentions(community)

    # a tech label needs a strict win; every tie falls through to the
    # community / neutral branches
    if pro_score > anti_score and pro_score > community_score:
        return 'pro_tech'
    elif anti_score > pro_score and anti_score > community_score:
        return 'anti_tech'
    elif community_score > 0:
        return 'community_focused'
    else:
        return 'neutral'

# Label every cleaned post with its dominant policing attitude.
posts['policing_attitude'] = posts['full_text'].map(classify_predictive_policing_attitude)

In [None]:
# concern categories for predictive policing (true or false for each category)
def extract_concerns(text):
    """Flag which predictive-policing concern categories a text touches on.

    Args:
        text: The text to inspect (lower-cased internally).

    Returns:
        dict mapping 'privacy', 'bias', 'accuracy', 'transparency' and
        'effectiveness' to a bool that is True when any term of that category
        occurs in the text.
    """
    text = text.lower()

    def mentions_any(words):
        # Terms must start at a word boundary: with plain substring tests
        # 'terror'/'terrorism' raised the accuracy flag through 'error' and
        # 'unbiased' raised the bias flag. Suffixes still match ('errors',
        # 'biased', 'wasted').
        return any(re.search(r"\b" + re.escape(word), text) for word in words)

    concern_terms = {
        # privacy concerns
        'privacy': ['privacy', 'surveillance', 'monitoring', 'tracking', 'data collection'],
        # bias concerns
        'bias': ['bias', 'discrimination', 'racial', 'profiling', 'unfair', 'prejudice'],
        # accuracy concerns
        'accuracy': ['wrong', 'mistake', 'inaccurate', 'false positive', 'error'],
        # transparency concerns ('transparent' is not a prefix of
        # 'transparency', so the noun is listed as well)
        'transparency': ['black box', 'transparent', 'transparency', 'explain',
                         'understand', 'accountability'],
        # effectiveness concerns ("doesn't work" is listed with a straight
        # apostrophe, a curly one and none, so it is also found in text whose
        # apostrophes were stripped)
        'effectiveness': ['waste', 'ineffective', 'doesn\'t work', 'doesn’t work',
                          'doesnt work', 'pointless', 'useless'],
    }

    return {category: mentions_any(words) for category, words in concern_terms.items()}

# apply to the text and add to posts DF
concern_results = posts['full_text'].apply(extract_concerns)
# Build the frame on posts' own index so the column-wise concat lines up row for row.
concern_df = pd.DataFrame(concern_results.tolist(), index=posts.index)
# Drop columns left behind by an earlier run of this cell; otherwise re-running
# it would append a second copy of every concern column.
posts = pd.concat(
    [posts.drop(columns=concern_df.columns, errors='ignore'), concern_df],
    axis=1
)

In [None]:
# geographic insights
def extract_location_mentions(text):
    """List the London boroughs named in a text.

    Args:
        text: The text to inspect (lower-cased internally).

    Returns:
        The matching borough names in the order of the list below, or
        ['general'] when no borough is mentioned.
    """
    # london boroughs
    london_areas = [
        'barking and dagenham', 'barnet', 'bexley', 'brent', 'bromley',
        'camden', 'croydon', 'ealing', 'enfield', 'greenwich', 'hackney',
        'hammersmith and fulham', 'haringey', 'harrow', 'havering', 'hillingdon',
        'hounslow', 'islington', 'kensington and chelsea', 'kingston upon thames',
        'lambeth', 'lewisham', 'merton', 'newham', 'redbridge', 'richmond upon thames',
        'southwark', 'sutton', 'tower hamlets', 'waltham forest', 'wandsworth',
        'westminster'
    ]

    text = text.lower()
    mentioned_areas = []
    for area in london_areas:
        # Words inside a name may be separated by any whitespace.
        parts = [
            r"\s+".join(re.escape(word) for word in part.split())
            for part in area.split(" and ")
        ]
        # The connecting "and" is optional (stop-word removal deletes it) and
        # may also be written "&".
        name = r"\s+(?:(?:and|&)\s+)?".join(parts)
        # Match whole words only: substring tests found 'ealing' in
        # 'stealing'/'dealing' and 'harrow' in 'harrowing'. A possessive "s"
        # (with or without apostrophe) is tolerated.
        if re.search(r"\b" + name + r"(?:['’]?s)?\b", text):
            mentioned_areas.append(area)
    return mentioned_areas if mentioned_areas else ['general']

# Record, per cleaned post, the list of boroughs it names (or ['general']).
posts['mentioned_areas'] = posts['full_text'].map(extract_location_mentions)

In [None]:
# experience indicators
def classify_experience_type(text):
    """Guess how the author relates to the crime being discussed.

    Categories are tested in order and the first one with a matching cue wins.

    Args:
        text: The text to classify (lower-cased internally).

    Returns:
        'personal_victim', 'indirect_victim', 'witness', 'media_informed' or,
        when no cue matches, 'general_opinion'.
    """
    text = text.lower()

    # Cues are regular expressions. Short, ambiguous words are anchored on word
    # boundaries because plain substring tests matched 'read' in 'already',
    # 'saw' in 'chainsaw' and 'i was' in 'taxi was'.
    experience_cues = [
        ('personal_victim', [
            r"\bmy\s+(?:house|home|flat|apartment)",
            r"\bi\s+was\b",
            r"\bhappened\s+to\s+me\b",
        ]),
        # 'neighbou?r' accepts both the US and the UK spelling
        ('indirect_victim', [r"neighbou?r", r"friend", r"family", r"colleague"]),
        ('witness', [r"\bwitnessed\b", r"\bsaw\b", r"\bobserved\b"]),
        ('media_informed', [r"\bheard\b", r"\bread(?:s|ing)?\b", r"\bnews", r"\breport"]),
    ]

    # check what experience the poster had
    for label, cues in experience_cues:
        if any(re.search(cue, text) for cue in cues):
            return label
    return 'general_opinion'

# apply and add to posts DF
# The first-person cues ('my house', 'i was', 'happened to me', ...) consist of
# stop words that the cleaning step removes from `full_text`, so on the cleaned
# text 'personal_victim' could never be returned. Classify the original
# title + body instead.
experience_input = posts['title'].fillna('') + " " + posts['selftext'].fillna('')
posts['experience_type'] = experience_input.apply(classify_experience_type)

In [None]:
# full analysis
def create_comprehensive_analysis():
    """Print a console report summarising the derived columns of ``posts``.

    Reads the module-level ``posts`` frame and prints, in order: overall
    sentiment, trust indicators, policing attitudes, concern categories,
    experience types and the ten most frequent area mentions. Returns None.
    """
    total_posts = len(posts)

    def share(n):
        # percentage of all posts represented by n
        return n / total_posts * 100

    print("Evaluation Analysis\n")

    # --- overall sentiment ---
    sentiment = posts['overall_sentiment']
    is_positive = sentiment > 0
    is_negative = sentiment < 0
    print("Overall sentiment towards policing:")
    print(f"   Average sentiment: {sentiment.mean():.3f}")
    print(f"   Positive sentiment: {is_positive.sum()} posts ({is_positive.mean()*100:.1f}%)")
    print(f"   Negative sentiment: {is_negative.sum()} posts ({is_negative.mean()*100:.1f}%)")

    # --- trust ---
    trust = posts['trust_net']
    trust_positive = (trust > 0).sum()
    trust_negative = (trust < 0).sum()
    print("\nTrust indicators:")
    print(f"   Net trust score: {trust.mean():.3f}")
    print(f"   Trust-positive posts: {trust_positive} ({share(trust_positive):.1f}%)")
    print(f"   Trust-negative posts: {trust_negative} ({share(trust_negative):.1f}%)")

    # --- attitudes ---
    print("\nAttitudes towards predictive policing:")
    for attitude, n_posts in posts['policing_attitude'].value_counts().items():
        print(f"   {attitude}: {n_posts} posts ({share(n_posts):.1f}%)")

    # --- concerns (boolean columns, so the sum is the number of posts flagged) ---
    print("\nMain concerns:")
    for concern in ('privacy', 'bias', 'accuracy', 'transparency', 'effectiveness'):
        n_flagged = posts[concern].sum()
        print(f"   {concern.capitalize()}: {n_flagged} mentions ({share(n_flagged):.1f}%)")

    # --- experience types ---
    print("\nExperience types:")
    for exp_type, n_posts in posts['experience_type'].value_counts().items():
        print(f"   {exp_type}: {n_posts} posts ({share(n_posts):.1f}%)")

    # --- geography: flatten the per-post area lists before counting ---
    print("\nGeographic mentions:")
    area_counts = Counter(
        area for areas in posts['mentioned_areas'] for area in areas
    )
    for area, n_mentions in area_counts.most_common(10):
        print(f"   {area}: {n_mentions} mentions")

In [None]:
# create visualizations
def create_visualizations():
    """Draw a 2x3 grid of summary charts for the module-level ``posts`` frame.

    Panels: sentiment histogram, trust vs effectiveness scatter, attitude pie,
    concern bar chart, experience-type bar chart and a box plot of sentiment
    per experience type. The figure is shown; nothing is returned.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    (ax_hist, ax_scatter, ax_pie), (ax_concerns, ax_experience, ax_box) = axes

    # Sentiment distribution
    ax_hist.hist(posts['overall_sentiment'], bins=20, alpha=0.7, color='skyblue')
    ax_hist.set(
        title='Overall Sentiment Distribution',
        xlabel='Sentiment Score',
        ylabel='Frequency',
    )

    # Trust vs Effectiveness
    ax_scatter.scatter(posts['trust_net'], posts['effectiveness_net'], alpha=0.6)
    ax_scatter.set(
        title='Trust vs Effectiveness Perception',
        xlabel='Trust Net Score',
        ylabel='Effectiveness Net Score',
    )

    # Policing attitudes
    attitude_counts = posts['policing_attitude'].value_counts()
    ax_pie.pie(attitude_counts.values, labels=attitude_counts.index, autopct='%1.1f%%')
    ax_pie.set(title='Attitudes Toward Tech-Based Policing')

    # Concerns breakdown (number of posts flagged per concern)
    concern_cols = ['privacy', 'bias', 'accuracy', 'transparency', 'effectiveness']
    ax_concerns.bar(concern_cols, [posts[col].sum() for col in concern_cols], color='lightcoral')
    ax_concerns.set(title='Main Concerns About Predictive Policing')

    # Experience types
    exp_counts = posts['experience_type'].value_counts()
    ax_experience.bar(exp_counts.index, exp_counts.values, color='lightgreen')
    ax_experience.set(title='Types of Crime Experience')

    # Sentiment by experience type
    sns.boxplot(data=posts, x='experience_type', y='overall_sentiment', ax=ax_box)
    ax_box.set(title='Sentiment by Experience Type')

    # The three categorical panels get slanted tick labels so they stay readable.
    for ax in (ax_concerns, ax_experience, ax_box):
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# export
def export_analysis_ready_data():
    """Write the analysis columns of ``posts`` to CSV and return them.

    Returns:
        A copy of the module-level ``posts`` frame restricted to the post
        metadata and derived metric columns; the same frame is written
        (without its index) to 'reddit_analysis_processed.csv'.
    """
    metadata_cols = ['id', 'subreddit', 'score', 'num_comments', 'created_utc']
    sentiment_cols = ['overall_sentiment', 'subjectivity', 'trust_net', 'effectiveness_net']
    label_cols = ['policing_attitude', 'experience_type', 'mentioned_areas']
    concern_cols = ['privacy', 'bias', 'accuracy', 'transparency', 'effectiveness']

    # Column order of the exported file: metadata, scores, labels, concern flags.
    selected = metadata_cols + sentiment_cols + label_cols + concern_cols
    analysis_df = posts.loc[:, selected].copy()

    analysis_df.to_csv('reddit_analysis_processed.csv', index=False)
    print("Processed data exported to 'reddit_analysis_processed.csv'")

    return analysis_df

In [None]:
# Run the reporting steps in order: console summary, figure grid, CSV export.
# All three read the module-level `posts` frame built by the cells above.
create_comprehensive_analysis()
create_visualizations()
# keep the exported frame available for further interactive work
processed_data = export_analysis_ready_data()

print(f"\nAnalysis complete! Processed {len(posts)} posts.")