<a href="https://colab.research.google.com/github/Abdoulrasheed/crime_prediction_model/blob/master/crime_prediction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
import numpy as np
import pandas as pd

import folium
import re, string, random
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples as t_sample

# used for training our model
from nltk import classify
from nltk import NaiveBayesClassifier

# reporting library
import plotly.graph_objects as pl

from sklearn.linear_model import LinearRegression

In [3]:
from geopy.geocoders import Nominatim

def get_coordinates(address):
    geocoder = Nominatim(user_agent="crime_prediction", timeout=10000)
    location = geocoder.geocode(address)
    print(f'{location} [lon: {location.longitude}, lat: {location.latitude}]')
    return [location.longitude, location.latitude]

In [4]:
class Model:
    sentiments = []
    
    def __init__(self):
        print("Initializing model...\n")
    
    def convert_to_dict(self, token_list):
        """
            Yield a dictionary with each token as key and it's value True
        """
        
        for tweet_tokens in token_list:
            yield dict([token, True] for token in tweet_tokens)
    
    def count_results(self):
        """ Count the number of positive and negative data from each sentiment """
        
        print("Processing result of nlp...")
        
        no_of_2015_positive = np.count_nonzero(self.sen_2015 == "Positive")
        print(no_of_2015_positive)
        
        no_of_2015_negative = np.count_nonzero(self.sen_2015 == "Negative")
    
        no_of_2016_positive = np.count_nonzero(self.sen_2016 == "Positive")
        no_of_2016_negative = np.count_nonzero(self.sen_2016 == "Negative")
        
        no_of_2017_positive = np.count_nonzero(self.sen_2017 == "Positive")
        no_of_2017_negative = np.count_nonzero(self.sen_2017 == "Negative")
        
        no_of_2018_positive = np.count_nonzero(self.sen_2018 == "Positive")
        no_of_2018_negative = np.count_nonzero(self.sen_2018 == "Negative")
        
        no_of_2019_positive = np.count_nonzero(self.sen_2019 == "Positive")
        no_of_2019_negative = np.count_nonzero(self.sen_2019 == "Negative")
        
        aggregates = {
                "2015": {"pos": no_of_2015_positive, "neg": no_of_2015_negative}, 
                "2016": {"pos": no_of_2016_positive, "neg": no_of_2016_negative},
                "2017": {"pos": no_of_2017_positive, "neg": no_of_2017_negative},
                "2018": {"pos": no_of_2018_positive, "neg": no_of_2018_negative},
                "2019": {"pos": no_of_2019_positive, "neg": no_of_2019_negative},
                }
        print("Done\n")
        return aggregates
    
    def get_geocoordinates(self):
        self.longitudes = []
        self.latitudes = []
        
        for address in self.dataset.address:
            loc = get_coordinates(address)
            self.longitudes.append(loc[0])
            self.latitudes.append(loc[1])
            
        self.dataset['longitudes'] = self.longitudes
        self.dataset['latitudes'] = self.latitudes
        self.dataset.to_csv("sample_data/warehouse.csv")
    
    def generate_heatmap(self):
      heatmap = folium.Map(location=[9.2592697, 12.405366], zoom_start=10)
      # self.get_geocoordinates()

      # Adding markers to the map
      for row in self.sentiments:
        colour = "red" if row[2] == "Negative" else "blue"

        html = folium.Html(f"<div><img width='35' src='https://p7.hiclipart.com/preview/861/726/346/computer-icons-professional-avatar-avatar.jpg'/></div><b>Name:</b> {row[0]}<br><b>Tweet:</b> {row[1][:15]}...<br><b>Sentiment:</b> {row[2]}<br><b>Crime:</b> {row[5]}<br><b>Behaviour Prediction: N</b>", script=True)
        popup = folium.Popup(html, max_width=2850)

        marker = folium.Marker(
            location=[float(row[3]), float(row[4])], 
            popup=popup, 
            tooltip="View Detail",
            icon=folium.Icon(color=colour, icon="user"),
          )
        marker.add_to(heatmap)

      return heatmap
            
    def lemmatize_sentence(self, tokens):
        """ 
            analyzes the structure of the word and its context 
            to convert it to a normalized form. eg the word players or playing becomes play.
        """
        
        print("Lemmatizing tokens...")
        
        lemmatizer = WordNetLemmatizer()
        lemmatized_sentence = []
        for word, tag in pos_tag(tokens):
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
        
        print("Done\n")
        return lemmatized_sentence

    def predict(self):
        print("Finalizing ...\n")
        
        self.aggs = self.count_results()
        years_pos_sentiments = [[self.aggs['2015']['pos']], [self.aggs['2016']['pos']], [self.aggs['2017']['pos']], [self.aggs['2018']['pos']], [self.aggs['2019']['pos']]]
        years_neg_sentiments = [[self.aggs['2015']['neg']], [self.aggs['2016']['neg']], [self.aggs['2017']['neg']], [self.aggs['2018']['neg']], [self.aggs['2019']['neg']]]
        
        regressor = LinearRegression()
        regressor.fit(years_pos_sentiments, years_neg_sentiments)
        
        self.year_to_predict = input("Please enter the year you wants to predict: ")
        # Predict the rate of crime in the provided year
        return regressor.predict([[float(self.year_to_predict)]])

    def remove_noise(self, tweet_tokens):
        """
            Remove punctuations, stopwords and hyperlinks 
        """

        cleaned_tokens = []
        stop_words = stopwords.words('english')

        for token, tag in pos_tag(tweet_tokens):
            token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                        '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
            token = re.sub("(@[A-Za-z0-9_]+)","", token)

            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'

            lemmatizer = WordNetLemmatizer()
            token = lemmatizer.lemmatize(token, pos)

            if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
                cleaned_tokens.append(token.lower())
        return cleaned_tokens
    
    def prepare_training_datasets(self, positive_tokens_for_model, negative_tokens_for_model):
        print("Preparing training data")
        
        positive_dataset = [(data_dict, "Positive") for data_dict in positive_tokens_for_model]
        negative_dataset = [(data_dict, "Negative") for data_dict in negative_tokens_for_model]

        # combine our modelled datasets
        dataset = positive_dataset + negative_dataset

        # randomnize the dataset to avoid bias as 
        # the data contains all positive data followed by all negative data in sequence
        random.shuffle(dataset)
        print("Done\n")
        return dataset
    
    def plot(self, predicted_result):
        print("Plotting Graph")
        
        years_pos_sentiments = (self.aggs['2015']['pos'], self.aggs['2016']['pos'], self.aggs['2017']['pos'], self.aggs['2018']['pos'], self.aggs['2019']['pos'])
        years_neg_sentiments = (self.aggs['2015']['neg'], self.aggs['2016']['neg'], self.aggs['2017']['neg'], self.aggs['2018']['neg'], self.aggs['2019']['neg'])
        years_pos_sentiments += (abs(predicted_result[0].tolist()[0]/50),)
        years_neg_sentiments += (abs(predicted_result[0].tolist()[0]/165),)
        
        print("Generating graph ...")
        
        predicted_year = f"{self.year_to_predict} Projection"
        
        years = ['2015', '2016', '2017', '2018', '2019', predicted_year]
        x = list(range(len(years)))

        # Specify the plots
        bar_plots = [
            pl.Bar(x=x, y=years_pos_sentiments, name='Positive', marker=pl.bar.Marker(color='#0343df')),
            pl.Bar(x=x, y=years_neg_sentiments, name='Negative', marker=pl.bar.Marker(color='#e50000')),
        ]
        
        # Specify the layout
        layout = pl.Layout(
            title=pl.layout.Title(text="NLP and Machine Learning Crime Prediction", x=0.5),
            yaxis_title="Crime Rate",
            xaxis_tickmode="array",
            xaxis_tickvals=list(range(27)),
            xaxis_ticktext=tuple(years),
        )
            
        # Make the multi-bar plot
        fig = pl.Figure(data=bar_plots, layout=layout)

        # Tell Plotly to render it
        fig.show()
        print("Done\n")
        
        return self.generate_heatmap()
    
    def tokenize(self):
        print("Tokenizing training data...\n")
        self.pos_data_tokens = t_sample.tokenized('positive_tweets.json')
        self.neg_data_tokens = t_sample.tokenized('negative_tweets.json')
    
    def train(self, dataset):
        self.dataset = dataset
        pos_data_tokens_list = []
        neg_data_tokens_list = []
        
        self.tokenize()
        
        print("Removing punctuations, stopwords and hyperlinks ...")
        for p_tokens, n_tokens in zip(self.pos_data_tokens, self.neg_data_tokens):
            pos_data_tokens_list.append(self.remove_noise(p_tokens))
            neg_data_tokens_list.append(self.remove_noise(n_tokens))
        print("Done\n")

        print("Converting tokenized_words...")
        positive_tokens_for_model = self.convert_to_dict(pos_data_tokens_list)
        negative_tokens_for_model = self.convert_to_dict(neg_data_tokens_list)
        print("Done\n")
        
        train_data = self.prepare_training_datasets(positive_tokens_for_model, negative_tokens_for_model)
        classifier = NaiveBayesClassifier.train(train_data)
        
        for row in dataset.values:
            name = row[0].capitalize()
            tweet = row[5]
            lon = row[6]
            lat = row[7]
            crime = row[2]
            sms = row[4]
            
            tokenised_tweet = self.remove_noise(word_tokenize(tweet))
            result = classifier.classify(dict([token, True] for token in tokenised_tweet))
            self.sentiments.append([name, tweet, result, lon, lat, crime, sms])
        
        # print("Accuracy is:", classify.accuracy(classifier, train_data))
        # print(classifier.show_most_informative_features(15))
        
        # split the sentiments into 5 different groups, each representing an imaginary report
        # for a specific year
        splitted = np.array_split(self.sentiments, 5)
        self.sen_2015, self.sen_2016, self.sen_2017, self.sen_2018, self.sen_2019 = splitted

In [None]:
heatmap = ''
def get_data_warehouse():
    names = ["fullname", "phone", "crime", "address", "sms", "tweets", "longitude", "latitude"]
    return pd.read_csv('sample_data/warehouse.csv', names=names, keep_default_na=False, na_values=[""])

def main():
    print("Reading data warehouse...\n")
    data_warehouse = get_data_warehouse()
    
    model = Model()
    model.train(data_warehouse)
    predicted_result = model.predict()
    print(f"Prediction Results: {predicted_result}")
    return model.plot(predicted_result)

if __name__ == "__main__":
    heatmap = main()
heatmap

Reading data warehouse...

Initializing model...

Tokenizing training data...

Removing punctuations, stop words and hyperlinks ...
Done

Converting tokenized_words...
Done

Preparing training data
Done

Finalizing ...

Processing result of nlp...
58
Done

Please enter the year you wants to predict: 2023
Prediction Results: [[-1876.39723926]]
Plotting Graph
Generating graph ...


Done

