In [None]:
import pandas as pd
import numpy as np
import random
import json
import re

In [None]:
# --- SYNTHETIC SURVEY ANSWERS ---
# Builds an exemplary dataset of survey answers that is used to showcase
# the visualizations of the police and community tool.

# Seed both random generators used below so that "Restart & Run All"
# regenerates exactly the same synthetic survey file.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Shape of the numeric part of the survey: nine questions, each answered
# on an integer scale from SCALE_MIN to SCALE_MAX.
N_QUESTIONS = 9
SCALE_MIN, SCALE_MAX = 1, 5

# --- 0. Datasets ----
df_crime = pd.read_csv("all_crimes_2022-2025.csv")
# Drop missing codes first: unique() keeps NaN, which would otherwise be
# treated as one more area and receive fabricated survey answers.
lsoas = df_crime["LSOA code"].dropna().unique().tolist()

# --- 1. Survey Answers  ---
# Edge cases were selected from the crime data map to create a visible
# difference between perceived and predicted safety: areas that look very
# good crime-wise get bad reviews and vice versa.
# NOTE(review): edge_good receives the maximum answer and edge_bad the
# minimum; confirm this agrees with the direction of the survey scale.
edge_good = ["E33979","E33983","E33987","E33989","E33992","E35292"]
edge_bad  = ["E529759","E529760","E529761","E529762","E529763","E529764"]
good_codes = set(edge_good)
bad_codes = set(edge_bad)

# An edge-case code that does not occur in the crime data never reaches
# the loop below, so report it instead of ignoring it silently.
unknown_edge_codes = sorted((good_codes | bad_codes) - set(lsoas))
if unknown_edge_codes:
    print(f"warning: edge-case codes not found in the crime data: {unknown_edge_codes}")

with open("posts-dataset.json", encoding="utf-8") as f:
    posts_json = json.load(f)
# Collapse every run of whitespace (including newlines) to a single space.
texts = [re.sub(r"\s+", " ", p["text"]).strip() for p in posts_json]

questions   = [f"question_{i}" for i in range(1, N_QUESTIONS + 1)]
survey_rows = []

for code in lsoas:
    # Each area gets between 2 and 5 synthetic respondents.
    n_resp = random.randint(2, 5)
    if code in good_codes:
        # Build one independent list per respondent instead of repeating
        # a reference to the same inner list.
        numeric_block = [[SCALE_MAX] * N_QUESTIONS for _ in range(n_resp)]
    elif code in bad_codes:
        numeric_block = [[SCALE_MIN] * N_QUESTIONS for _ in range(n_resp)]
    else:
        # The upper bound of np.random.randint is exclusive, hence the + 1.
        numeric_block = np.random.randint(
            SCALE_MIN, SCALE_MAX + 1, size=(n_resp, N_QUESTIONS)
        ).tolist()
    # Free-text answers are sampled with replacement from the posts.
    open_block = random.choices(texts, k=n_resp)
    for nums, txt in zip(numeric_block, open_block):
        survey_rows.append([code, *nums, txt])

cols      = ["LSOA code"] + questions + ["open_answer"]
survey_df = pd.DataFrame(survey_rows, columns=cols)

# --- 2. Mean and Normalization ---
# Per-respondent mean over all questions, rescaled from
# [SCALE_MIN, SCALE_MAX] to [0, 1].
survey_df["survey_mean"]      = survey_df[questions].mean(axis=1)
survey_df["survey_normalized"] = (survey_df["survey_mean"] - SCALE_MIN) / (SCALE_MAX - SCALE_MIN)

# Survey answers are stored in survey_with_text.csv.
survey_df.to_csv("survey_with_text.csv", index=False)
print(f"saved {len(survey_df)}: survey_with_text.csv")

In [None]:
import pandas as pd
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon resource for NLTK (presumably the data that
# SentimentIntensityAnalyzer scores text with -- TODO confirm). quiet=True
# is passed so the download does not print progress output.
nltk.download('vader_lexicon', quiet=True)

In [None]:
# --- TOPIC MATCHING AND SENTIMENT ANALYSIS ---
# Can be run once additional answers have been saved from the survey.

# --- 0. Datasets ----
# Load the survey answers from the CSV file; the steps below read the
# free-text "open_answer" column.
survey_df = pd.read_csv("survey_with_text.csv")

# --- 1. Preset Topics ---
# Topics are based on literature research (correlation with increased
# burglary). They are predefined so police officers can keep control over
# them. Each topic name maps to the lowercase words and phrases that are
# looked for in an open answer.

topic_keywords = dict(
    holiday_empty_homes=[
        "getaway",
        "holiday",
        "vacation",
        "away",
        "out of town",
        "empty home",
        "no one home",
        "not there",
        "out on holiday",
        "trust house sitter",
        "left house",
        "holiday home",
        "airbnb",
    ],
    vacant_unoccupied_properties=[
        "vacant",
        "unoccupied",
        "abandoned",
        "empty property",
        "empty flat",
        "derelict",
        "boarded up",
        "for sale sign",
        "for rent sign",
        "nobody living",
        "vacant lot",
    ],
    community_watch=[
        "neighbourhood watch",
        "community watch",
        "watch group",
        "whatsapp group",
        "facebook group",
        "alert group",
        "block watch",
        "street watch",
        "community patrol",
        "resident group",
        "neighbour alert",
        "vigilante",
    ],
    poor_street_lighting_cctv=[
        "streetlight",
        "lighting",
        "dark alley",
        "poor lighting",
        "lights out",
        "street lights off",
        "no street light",
        "cctv",
        "camera not working",
        "broken camera",
        "no cameras",
        "cctv down",
        "lamp post",
        "flickering light",
    ],
    police_visibility_response_time=[
        "police",
        "officer",
        "patrol",
        "visible police",
        "see police",
        "no police",
        "took an hour",
        "response time",
        "slow response",
        "waited for police",
        "call 112",
        "urgent call",
        "no response",
    ],
    reporting_barriers_mistrust=[
        "didn't report",
        "won't report",
        "no point calling",
        "don't trust police",
        "police won't help",
        "afraid to report",
        "didn't bother reporting",
        "not confident to call",
        "fear police",
        "mistrust police",
        "police ignore",
    ],
    drug_abuse_hotspots=[
        "drug",
        "drugs",
        "dealers",
        "drug dealing",
        "narcotics",
        "heroin",
        "cocaine",
        "cannabis",
        "weed smell",
        "smell weed",
        "crack",
        "meth",
        "overdose",
        "drug house",
        "drug den",
    ],
)

# --- 2. Topic Matching ---
# processing open answers 

# Compiled topic patterns, keyed by the tuple of keywords they were built from.
_TOPIC_PATTERNS = {}


def _topic_pattern(kws):
    """Return a cached regex that finds any keyword of `kws` as a whole word or phrase."""
    key = tuple(kw for kw in kws if kw)
    pattern = _TOPIC_PATTERNS.get(key)
    if pattern is None:
        if key:
            alternatives = "|".join(re.escape(kw) for kw in key)
            # The lookarounds reject hits inside a longer word; the optional
            # "s"/"es" suffix keeps simple plurals ("holidays") matching.
            source = r"(?<!\w)(?:" + alternatives + r")(?:e?s)?(?!\w)"
        else:
            # No usable keywords: a pattern that can never match.
            source = r"(?!)"
        pattern = re.compile(source, re.IGNORECASE)
        _TOPIC_PATTERNS[key] = pattern
    return pattern


def match_topics(text, keywords=None):
    """Return the topics whose keywords occur in `text`.

    Keywords are matched case-insensitively as whole words or phrases (a
    trailing plural "s"/"es" is tolerated). Plain substring matching would
    tag "something" with the keyword "meth" and "takeaway" with "away".

    Parameters
    ----------
    text : str
        The free-text answer to classify.
    keywords : dict of str -> list of str, optional
        Mapping of topic name to its keywords. Defaults to the module-level
        ``topic_keywords``.

    Returns
    -------
    list of str
        Matching topic names in the iteration order of `keywords`, or
        ``["other"]`` when no topic matches.
    """
    if keywords is None:
        keywords = topic_keywords
    # Typographic apostrophes would otherwise never match keywords such as
    # "didn't report", which are written with a straight apostrophe.
    text = text.replace("\u2019", "'")
    matches = [
        topic
        for topic, kws in keywords.items()
        if _topic_pattern(kws).search(text)
    ]
    return matches or ["other"]

# Tag every free-text answer with the preset topics it mentions; the list
# is also stored as one comma-separated string.
answer_text = survey_df["open_answer"].astype(str)
survey_df["matched_topics"] = answer_text.apply(match_topics)
survey_df["matched_topics_str"] = survey_df["matched_topics"].apply(", ".join)

# --- 3. Sentiment Scores ---
sia = SentimentIntensityAnalyzer()


def compound_score(answer):
    """Return the "compound" entry of the analyzer's polarity scores for `answer`."""
    return sia.polarity_scores(answer)["compound"]


survey_df["sentiment_score"] = answer_text.apply(compound_score)

# One row per (answer, topic) pair, so an answer that matched several
# topics contributes to each of them in the summary.
exploded = survey_df.explode("matched_topics")

summary_spec = {
    "count_posts": pd.NamedAgg(column="sentiment_score", aggfunc="size"),
    "avg_sentiment": pd.NamedAgg(column="sentiment_score", aggfunc="mean"),
    "median_sentiment": pd.NamedAgg(column="sentiment_score", aggfunc="median"),
}
per_topic = exploded.groupby("matched_topics").agg(**summary_spec)
# Ascending sort: the topic with the lowest average sentiment comes first.
topic_sentiment = per_topic.reset_index().sort_values("avg_sentiment")

# Persist the enriched answers and the per-topic summary.
survey_df.to_csv("survey_with_topics.csv", index=False)
topic_sentiment.to_csv("sentiment_summary.csv", index=False)

for written_file in ("survey_with_topics.csv", "sentiment_summary.csv"):
    print(written_file)