In [None]:
import numpy as np
import pandas as pd
import nltk as nltk
import re
import string
import nltk


from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Loading Data

In [None]:
# Read the four per-category report files in one pass; the unpacking order
# matches the order of the paths below.
# NOTE(review): these are absolute Colab-style paths — adjust when running elsewhere.
df_car, df_crime, df_fire, df_robbery = map(
    pd.read_csv,
    (
        "/content/car_accidents_reports.csv",
        "/content/crime_reports.csv",
        "/content/fire_reports.csv",
        "/content/robbery_reports.csv",
    ),
)

In [None]:
# Show up to 500 characters per cell so the full report text is visible in outputs.
pd.options.display.max_colwidth = 500

# View and Explore Data

In [None]:
# (rows, columns) of the car-accident reports frame.
df_car.shape

(1000, 1)

In [None]:
# (rows, columns) of the crime reports frame.
df_crime.shape

(1000, 1)

In [None]:
# (rows, columns) of the fire reports frame.
df_fire.shape

(1000, 1)

In [None]:
# (rows, columns) of the robbery reports frame.
df_robbery.shape

(1000, 1)

In [None]:
# Tag every frame with its category in a "label" column.
# DataFrame.insert raises ValueError when the column already exists, so a
# plain insert makes this cell fail on a second run; overwrite in that case
# to keep the cell idempotent.
for frame, category in (
    (df_car, 'car accidents'),
    (df_crime, 'crime'),
    (df_fire, 'fire'),
    (df_robbery, 'robbery'),
):
    if 'label' in frame.columns:
        frame['label'] = category
    else:
        frame.insert(loc=1, column='label', value=category)

In [None]:
# Stack the four labelled frames into one dataset with a fresh 0..n-1 index.
df = pd.concat(
    [df_car, df_crime, df_fire, df_robbery],
    axis=0,
    ignore_index=True,
)
df.head(100)

Unnamed: 0,Report,label
0,This is John Doe from 9999 Ash Street. There's a multi-car pileup on the highway. It looks like there might be injuries,car accidents
1,"Hello, I'm Jane Smith at 123 Main Street. I've just witnessed a car crash into a shop front",car accidents
2,"Help, I'm Isabella Jackson and I'm calling from 789 Pine Lane. A car just hit a pedestrian and drove off",car accidents
3,"Hello, this is Robert Johnson at 2345 Hickory Street. I just saw a car hit a cyclist. The driver didn't stop",car accidents
4,"Emergency, this is Olivia Taylor from 789 Pine Lane. I just saw a car crash into a fence",car accidents
...,...,...
95,"Emergency, this is Robert Johnson from 654 Maple Road. I just saw a car crash into a fence",car accidents
96,"Help, I'm John Doe and I'm calling from 7777 Hemlock Circle. A car just crashed into a tree. The driver seems to be unconscious",car accidents
97,"Help, I am Ava Thomas and I'm calling from 8765 Juniper Road. A car just crashed into a tree. The driver seems to be unconscious and the car is leaking gasoline",car accidents
98,"Hello, I'm Harper Anderson at 456 Oak Avenue. I've just witnessed a motorcycle crash. The rider was thrown off and is lying on the ground",car accidents


In [None]:
# (rows, columns) of the combined dataset.
df.shape

(4000, 2)

In [None]:
# Summary of the combined frame (count / unique / top / freq for the text columns).
# A `unique` count below `count` for Report means some reports are repeated.
df.describe()

Unnamed: 0,Report,label
count,4000,4000
unique,3796,4
top,"Emergency, this is Emily Davis from 7777 Hemlock Circle. I just saw a group of people robbing a liquor store",car accidents
freq,3,1000


In [None]:
# Number of reports per category, to check class balance.
df.label.value_counts()

car accidents    1000
crime            1000
fire             1000
robbery          1000
Name: label, dtype: int64

In [None]:
# Check the data type of each column (both are expected to hold text at this stage).
print(df.dtypes)

Report    object
label     object
dtype: object


# Cleaning Data

In [None]:
# Count missing values per column before any text cleaning is applied.
print("Number of missing values in each column:")
print(df.isnull().sum())

Number of missing values in each column:
Report    0
label     0
dtype: int64


In [None]:
# Patterns are compiled once at definition time instead of on every call.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUMBER_RE = re.compile(r'\d+')

# Filled lazily on first use: the stop-word corpus is only downloaded in a
# later cell, so it cannot be loaded when this cell is first executed.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # A frozenset gives constant-time membership tests; the corpus reader
        # returns a plain list.
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['stemmer']


def clean_text(text):
    """Normalise a raw report into a space-separated string of stemmed tokens.

    Steps, in order: strip ASCII punctuation, lowercase, strip digit runs,
    tokenize with NLTK, drop English stop words, Porter-stem each token.

    Parameters
    ----------
    text : str
        Raw report text. ``None`` and NaN are treated as empty text; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``''`` when nothing is left).

    Notes
    -----
    Punctuation is removed before stop-word filtering, so contractions lose
    their apostrophe ("I'm" -> "im") and are then not matched by the stop-word
    list. This is kept as-is so the produced vocabulary does not change.
    """
    # Missing values (None / float NaN) would otherwise crash re.sub.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # remove punctuation, lowercase, then remove numbers
    no_punc = _PUNCTUATION_RE.sub('', text)
    lower_text = no_punc.lower()
    no_numbers = _NUMBER_RE.sub('', lower_text)

    # tokenize
    tokens = nltk.word_tokenize(no_numbers)

    # drop stop words, then stem what is left
    stop_words, stemmer = _get_clean_text_resources()
    stemmed = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(stemmed)

In [None]:
# Fetch the NLTK stop-word corpus; clean_text reads it via stopwords.words('english'),
# so this must run before clean_text is first called.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Run every raw report through clean_text and keep the result next to the original.
df['clean_text'] = df['Report'].apply(clean_text)
df

Unnamed: 0,Report,label,clean_text
0,This is John Doe from 9999 Ash Street. There's a multi-car pileup on the highway. It looks like there might be injuries,car accidents,john doe ash street there multicar pileup highway look like might injuri
1,"Hello, I'm Jane Smith at 123 Main Street. I've just witnessed a car crash into a shop front",car accidents,hello im jane smith main street ive wit car crash shop front
2,"Help, I'm Isabella Jackson and I'm calling from 789 Pine Lane. A car just hit a pedestrian and drove off",car accidents,help im isabella jackson im call pine lane car hit pedestrian drove
3,"Hello, this is Robert Johnson at 2345 Hickory Street. I just saw a car hit a cyclist. The driver didn't stop",car accidents,hello robert johnson hickori street saw car hit cyclist driver didnt stop
4,"Emergency, this is Olivia Taylor from 789 Pine Lane. I just saw a car crash into a fence",car accidents,emerg olivia taylor pine lane saw car crash fenc
...,...,...,...
3995,"Emergency, this is Amelia Thompson from 7777 Hemlock Circle. I just saw a group of people robbing a liquor store",robbery,emerg amelia thompson hemlock circl saw group peopl rob liquor store
3996,"Help, my name is James Brown. I'm at 5555 Walnut Street and my house was just broken into",robbery,help name jame brown im walnut street hous broken
3997,"Help, my name is Liam Williams. I'm at 8888 Alder Court and my purse was just snatched. A man on a bike grabbed it and rode off",robbery,help name liam william im alder court purs snatch man bike grab rode
3998,This is Emily Davis from 3456 Locust Way. I work at the bank and we've just been robbed. A man with a gun demanded all the money and then he fled the scene. I'm scared for my life and I don't know what to do,robbery,emili davi locust way work bank weve rob man gun demand money fled scene im scare life dont know


# Extract Features

In [None]:
# Create a LabelEncoder object
encoder = LabelEncoder()

# Fit the encoder to the labels and replace the text labels with integer codes.
# NOTE: this overwrites df["label"] in place, so re-running this cell refits the
# encoder on the integer codes rather than on the original category names —
# rebuild df from the earlier cells before running it again.
df["label"] = encoder.fit_transform(df["label"])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [None]:
# Fit TF-IDF on the cleaned reports and expose the result as a dense frame
# whose columns are the learned vocabulary terms.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down — confirm that this is intended.
tdidf_vect = TfidfVectorizer()
tdidf = tdidf_vect.fit_transform(df['clean_text'])

features_tdidf = pd.DataFrame(
    tdidf.toarray(),
    columns=tdidf_vect.get_feature_names_out(),
)

# Machine Learning Model


In [None]:
# Feature frames to join column-wise; currently only the TF-IDF frame.
feature_frames = [features_tdidf]
features = pd.concat(feature_frames, axis=1)

# Target vector: the encoded category of each report.
labels = df['label']
features.shape

(4000, 330)

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reports for evaluation.
# random_state pins the shuffle so the scores below are reproducible across
# runs; stratify keeps the four categories equally represented in both parts.
# NOTE(review): the dataset contains repeated reports (describe() shows fewer
# unique reports than rows), so identical texts may land in both train and
# test — consider de-duplicating before splitting.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3000, 330) (1000, 330) (3000,) (1000,)


## Selection Method

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features, using all available cores.
# random_state is fixed so the fitted forest (and the predictions made with
# `model` in the Prediction section) are the same on every run.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(x_train, y_train)

# Mean accuracy on the held-out split.
print(model.score(x_test, y_test))

1.0


### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the same split; fit() returns the estimator itself.
NBmodel = MultinomialNB().fit(x_train, y_train)

# Mean accuracy on the held-out split.
nb_accuracy = NBmodel.score(x_test, y_test)
print(nb_accuracy)

0.984


### SVC

In [None]:
from sklearn import model_selection, naive_bayes, svm

# Linear-kernel support vector classifier on the same split.
# NOTE(review): degree and gamma look unused with kernel='linear' — confirm
# against the SVC documentation before relying on them.
SVM_model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(x_train, y_train)

# Mean accuracy on the held-out split.
svm_accuracy = SVM_model.score(x_test, y_test)
print(svm_accuracy)

1.0


In [None]:
# Show the frame again: `label` now holds the integer codes from the encoder.
df

Unnamed: 0,Report,label,clean_text
0,This is John Doe from 9999 Ash Street. There's a multi-car pileup on the highway. It looks like there might be injuries,0,john doe ash street there multicar pileup highway look like might injuri
1,"Hello, I'm Jane Smith at 123 Main Street. I've just witnessed a car crash into a shop front",0,hello im jane smith main street ive wit car crash shop front
2,"Help, I'm Isabella Jackson and I'm calling from 789 Pine Lane. A car just hit a pedestrian and drove off",0,help im isabella jackson im call pine lane car hit pedestrian drove
3,"Hello, this is Robert Johnson at 2345 Hickory Street. I just saw a car hit a cyclist. The driver didn't stop",0,hello robert johnson hickori street saw car hit cyclist driver didnt stop
4,"Emergency, this is Olivia Taylor from 789 Pine Lane. I just saw a car crash into a fence",0,emerg olivia taylor pine lane saw car crash fenc
...,...,...,...
3995,"Emergency, this is Amelia Thompson from 7777 Hemlock Circle. I just saw a group of people robbing a liquor store",3,emerg amelia thompson hemlock circl saw group peopl rob liquor store
3996,"Help, my name is James Brown. I'm at 5555 Walnut Street and my house was just broken into",3,help name jame brown im walnut street hous broken
3997,"Help, my name is Liam Williams. I'm at 8888 Alder Court and my purse was just snatched. A man on a bike grabbed it and rode off",3,help name liam william im alder court purs snatch man bike grab rode
3998,This is Emily Davis from 3456 Locust Way. I work at the bank and we've just been robbed. A man with a gun demanded all the money and then he fled the scene. I'm scared for my life and I don't know what to do,3,emili davi locust way work bank weve rob man gun demand money fled scene im scare life dont know


# Prediction

In [None]:
new_text = input("enter the text:")

# Apply the same cleaning that produced the training column.
new_text = clean_text(new_text)
print("The Cleaned Text: ",new_text)

# Vectorize the new text using the same (already fitted) vectorizer
new_text_vectorized = tdidf_vect.transform([new_text])

if new_text_vectorized.nnz == 0:
    # No token of the input is in the fitted vocabulary (e.g. text in another
    # language), so the feature vector is all zeros. A prediction on it says
    # nothing about the input, so report that instead of a class.
    p = None
    confidence = None
    print("Predicted class label: unknown (no words from the training vocabulary were found)")
else:
    # The model was fitted on a DataFrame with named columns; pass the same
    # layout so the features line up by name.
    new_text_features = pd.DataFrame(
        new_text_vectorized.toarray(),
        columns=tdidf_vect.get_feature_names_out(),
    )

    # Make the prediction
    prediction = model.predict(new_text_features)
    prediction_proba = model.predict_proba(new_text_features)

    # predict_proba columns follow model.classes_, so look up the column of
    # the predicted class rather than assuming label value == column index.
    class_index = list(model.classes_).index(prediction[0])
    confidence = prediction_proba[0, class_index]

    # Convert the encoded prediction back to its category name with the
    # fitted encoder instead of a hand-maintained if/elif mapping.
    p = encoder.inverse_transform(prediction)[0]

    print("Predicted class label:", p)
    print("Confidence:", confidence)

enter the text:هناك حادث حريق
The Cleaned Text:  هناك حادث حريق
Predicted class label: crime
Confidence: 0.5


