#### 1. Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

#### 2. Loading the Dataset

In [None]:
# Load the traffic-violations CSV (path relative to the working directory) into `tv`
tv = pd.read_csv(filepath_or_buffer="Traffic_Violations.csv")

In [None]:
# Preview the first 5 rows of the freshly loaded DataFrame
print(tv.head(n=5))

#### 3. Data Preprocessing

In [None]:
# Count the missing entries in every column and print the per-column totals
missing_per_column = tv.isna().sum()
print(missing_per_column)

In [None]:
# Drop the identifier and location columns; nothing later in this notebook reads them
irrelevant_columns = [
    'SeqID',
    'Location',
    'Latitude',
    'Longitude',
    'Geolocation',
]
tv = tv.drop(columns=irrelevant_columns)

In [None]:
# Parse the 'Date Of Stop' column into datetime values.
# dayfirst=True makes an ambiguous date such as 01/02/2013 resolve to 1 February.
# NOTE(review): confirm that the CSV really stores dates day-first.
stop_dates = tv['Date Of Stop']
tv['Date Of Stop'] = pd.to_datetime(stop_dates, dayfirst=True)

In [None]:
# Parse 'Time Of Stop' as HH:MM:SS and keep only the time-of-day part,
# so the column ends up holding datetime.time objects
parsed_stop_times = pd.to_datetime(tv['Time Of Stop'], format='%H:%M:%S')
tv['Time Of Stop'] = parsed_stop_times.dt.time

In [None]:
# Replace missing values in the categorical columns with the placeholder 'Unknown'
categorical_columns = [
    'Agency', 'SubAgency', 'Charge', 'Article', 'Race',
    'Gender', 'Driver City', 'Driver State', 'DL State', 'Arrest Type',
]
tv[categorical_columns] = tv[categorical_columns].fillna(value='Unknown')

In [None]:
# Impute missing 'Year' entries with the mean of the observed values
year_mean = tv['Year'].mean()
tv['Year'] = tv['Year'].fillna(year_mean)


In [None]:
#One-hot encoding of categorical variables is not applied at this stage; 'Violation Type' is
#encoded later, in the Feature Engineering section. If it were needed here, the call would be:
#tv = pd.get_dummies(tv, columns=['Violation Type'])

In [None]:
# Preview the first 5 rows after the preprocessing steps above
print(tv.head(n=5))

#### 4. Exploratory Data Analysis (EDA)

In [None]:
#%pip install seaborn

In [None]:
import seaborn as sns

In [None]:
# Print the summary statistics produced by describe() with its default settings
summary_statistics = tv.describe()
print(summary_statistics)

In [None]:
# Histogram of 'Year' (10 bins, light-green bars) on a light-grey figure
year_figure = plt.figure(facecolor='lightgrey')
year_axes = year_figure.gca()
year_axes.hist(tv['Year'], bins=10, color='lightgreen')
year_axes.set_xlabel('Year')
year_axes.set_ylabel('Count')
year_axes.set_title('Distribution of Year')
plt.show()

In [None]:
# Bar chart of the (up to) 10 most frequent 'Violation Type' values.
# value_counts() sorts by descending frequency, so the first 10 entries are the most common.
violation_counts = tv['Violation Type'].value_counts()
top_violation_types = violation_counts.iloc[:10]
colors = ['pink', 'purple', 'orange']

bar_axes = plt.gca()
bar_axes.bar(top_violation_types.index, top_violation_types.values, color=colors)
bar_axes.set_xlabel('Violation Type')
bar_axes.set_ylabel('Count')
bar_axes.set_title('Top 10 Violation Types')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

In [None]:
# Scatter plot of 'Alcohol' against 'Property Damage' (red markers, sky-blue background)
fig, pt = plt.subplots(figsize=(8, 6))
pt.set_facecolor('skyblue')
pt.scatter(x=tv['Alcohol'], y=tv['Property Damage'], color='red')
pt.set(
    xlabel='Alcohol',
    ylabel='Property Damage',
    title='Alcohol vs. Property Damage',
)

plt.show()

#### 5. Feature Engineering

In [None]:
# Make sure 'Date Of Stop' is datetime-typed before the .dt accessors are used on it
date_of_stop = tv['Date Of Stop']
tv['Date Of Stop'] = pd.to_datetime(date_of_stop)

In [None]:
# Derive calendar features from the stop date, and the hour from the stop time
stop_date = tv['Date Of Stop']
tv['Day Of Week'] = stop_date.dt.dayofweek  # pandas numbering: Monday=0 ... Sunday=6
tv['Month'] = stop_date.dt.month
stop_time = pd.to_datetime(tv['Time Of Stop'], format='%H:%M:%S')
tv['Hour'] = stop_time.dt.hour

In [None]:
# One-hot encode 'Violation Type' into 'Violation_<value>' indicator columns.
# get_dummies returns a new DataFrame, so columns added to `tv` after this point
# do not appear in `tv_encoded`.
tv_encoded = pd.get_dummies(data=tv, prefix='Violation', columns=['Violation Type'])

In [None]:
# Preview the first 5 rows of the one-hot encoded frame
print(tv_encoded.head(n=5))

In [None]:
# List the column labels of the encoded frame (including the new 'Violation_*' indicators)
encoded_columns = tv_encoded.columns
print(encoded_columns)

In [None]:
# Encode the Yes/No flag columns of `tv` as 1/0.
# Series.map turns any value that is not a key of the mapping into NaN.
yes_no_codes = {'Yes': 1, 'No': 0}
for flag_column in ('Alcohol', 'Property Damage'):
    tv[flag_column] = tv[flag_column].map(yes_no_codes)

In [None]:
# Interaction feature: the element-wise product of the two 0/1 flags,
# i.e. 1 only where both 'Alcohol' and 'Property Damage' are 1
tv['Alcohol_PropertyDamage'] = tv['Alcohol'].mul(tv['Property Damage'])

In [None]:
# Calendar year at the moment this cell runs (local time)
current_year = pd.Timestamp.today().year

In [None]:
# Age in years = current year minus the value in 'Year'.
# NOTE(review): this treats 'Year' as a birth year; confirm what the column actually
# records (it may be a vehicle model year), in which case 'Age' is not the driver's age.
tv['Age'] = tv['Year'].rsub(current_year)

In [None]:
# Bin 'Age' into labelled categories:
#   up to 18 -> 'Teen', (18, 30] -> 'Young Adult', (30, 50] -> 'Adult', above 50 -> 'Senior'.
# pd.cut builds right-closed intervals, so without include_lowest the first bin is (0, 18]
# and an age of exactly 0 would be left as NaN. include_lowest=True closes that first bin
# on the left so 0 is labelled 'Teen'. Negative ages remain outside every bin (NaN).
bins = [0, 18, 30, 50, np.inf]
labels = ['Teen', 'Young Adult', 'Adult', 'Senior']
tv['Age_Category'] = pd.cut(tv['Age'], bins=bins, labels=labels, include_lowest=True)

In [None]:
# Preview the encoded frame again. It was created before 'Alcohol_PropertyDamage',
# 'Age' and 'Age_Category' were added to `tv`, so those columns are not part of it.
print(tv_encoded.head(n=5))

In [None]:
# List the encoded frame's column labels
encoded_columns = tv_encoded.columns
print(encoded_columns)

In [None]:
# Encode 'Accident' in the encoded frame as 0 (No) / 1 (Yes); other values become NaN
accident_codes = {'No': 0, 'Yes': 1}
tv_encoded['Accident'] = tv_encoded['Accident'].map(accident_codes)

In [None]:
# Preview the encoded frame now that 'Accident' is numeric
print(tv_encoded.head(n=5))


In [None]:
# List the encoded frame's column labels once more
encoded_columns = tv_encoded.columns
print(encoded_columns)

#### 6. Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs (X) and the column to predict (y).
# The target is the 'Citation' indicator produced by one-hot encoding 'Violation Type'.
target = 'Violation_Citation'
features = [
    'Alcohol',
    'Accident',
    'Gender',
]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X = tv_encoded[features]
y = tv_encoded[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Inspect the three feature columns together with the target
selected_columns = ['Alcohol', 'Accident', 'Gender', 'Violation_Citation']
specific_columns_data = tv_encoded.loc[:, selected_columns]

print(specific_columns_data)


In [None]:
# Encode 'Alcohol' in the encoded frame as 0 (No) / 1 (Yes); other values become NaN
alcohol_codes = {'No': 0, 'Yes': 1}
tv_encoded['Alcohol'] = tv_encoded['Alcohol'].map(alcohol_codes)



In [None]:
# Re-inspect the model columns now that 'Alcohol' is numeric
selected_columns = ['Alcohol', 'Accident', 'Gender', 'Violation_Citation']
specific_columns_data = tv_encoded.loc[:, selected_columns]

print(specific_columns_data)


In [None]:
# Encode 'Gender' as integer codes: M -> 1, F -> 2, U -> 3.
# Any other value (including the 'Unknown' placeholder used for missing genders)
# has no entry in the mapping and therefore becomes NaN.
gender_codes = {'M': 1, 'F': 2, 'U': 3}
tv_encoded['Gender'] = tv_encoded['Gender'].map(gender_codes)

In [None]:
# Re-inspect the model columns now that 'Gender' is numeric as well
selected_columns = ['Alcohol', 'Accident', 'Gender', 'Violation_Citation']
specific_columns_data = tv_encoded.loc[:, selected_columns]

print(specific_columns_data)


In [None]:
from sklearn.model_selection import train_test_split

# Redo the train/test split now that all three features are numerically encoded.
# Inputs (X) and the column to predict (y):
target = 'Violation_Citation'
features = ['Alcohol', 'Accident', 'Gender']

# 80/20 split with a fixed seed so the partition is repeatable
X = tv_encoded[features]
y = tv_encoded[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#### 7. Feature Selection

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Calculate the pairwise correlation matrix of the numeric and boolean columns.
# tv_encoded still carries text, date and time-of-day columns (e.g. 'Agency',
# 'Date Of Stop', 'Time Of Stop'); since pandas 2.0 DataFrame.corr() no longer skips
# such columns silently and raises instead, so they are filtered out explicitly first.
numeric_columns = tv_encoded.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_columns.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap (values shown with 2 decimals)
heatmap_figure, heatmap_axes = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='magma',
    linewidths=0.5,
    ax=heatmap_axes,
)
heatmap_axes.set_title('Correlation Matrix')
plt.show()


In [None]:
# Keep only the rows in which every model column (features + target) is present
columns_to_clean = ['Alcohol', 'Accident', 'Gender', 'Violation_Citation']
rows_fully_populated = tv_encoded[columns_to_clean].notna().all(axis=1)
tv_encoded_cleaned = tv_encoded[rows_fully_populated]


In [None]:
# Drop any row that still has NaN in a model column.
# dropna only removes rows with missing values; it does not filter rows by dtype.
columns_to_check = ['Alcohol', 'Accident', 'Gender', 'Violation_Citation']
tv_encoded_cleaned = tv_encoded_cleaned.dropna(how='any', subset=columns_to_check)

In [None]:
# Columns inspected by the NaN and dtype checks in the following cells
columns_to_check = [
    'Alcohol',
    'Accident',
    'Gender',
    'Violation_Citation',
]


In [None]:
# Report how many NaN values remain in each checked column of the cleaned frame
nan_values = tv_encoded_cleaned[columns_to_check].isna().sum()
print("NaN Values:", nan_values, sep="\n")


In [None]:
# Report, per checked column, whether its dtype is float64 (True/False for each column)
float_values = tv_encoded_cleaned[columns_to_check].dtypes == 'float64'
print("Float64 Values:", float_values, sep="\n")


#### 8. Model Training & Evaluation (Logistic Regression/Decision Tree/Random Forest)

In [None]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Train a logistic-regression classifier on the three encoded features and score it
# on a held-out 20% test set. The resulting X_train/X_test/y_train/y_test are reused
# by the decision-tree and random-forest cells.

# Specify the features (X) and the target variable (y)
features = ['Alcohol', 'Accident', 'Gender']
target = 'Violation_Citation'

# Keep only rows where every model column is present. The Yes/No and M/F/U mappings
# applied earlier leave NaN for any other value (for example the 'Unknown' placeholder
# used for missing genders), and LogisticRegression refuses to fit on NaN input.
model_data = tv_encoded.dropna(subset=features + [target])

# Split the data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    model_data[features], model_data[target], test_size=0.2, random_state=42
)

# Create the model with scikit-learn's default settings and fit it on the training data
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model; these values are printed by the next cell
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)



In [None]:
# Print the logistic-regression scores, one "label value" pair per line
logreg_scores = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 score:", f1),
)
for score_label, score_value in logreg_scores:
    print(score_label, score_value)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the same train/test split as the logistic regression
# and report its scores.

# Create an instance of the Decision Tree model.
# random_state is fixed (same seed as the train/test split) so the fitted tree, its
# metrics and the tree plot are identical from run to run; without it scikit-learn
# permutes the candidate features randomly at each split.
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
decision_tree.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_dt = decision_tree.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)


# Print the evaluation metrics
print("Decision Tree Metrics:")
print("Accuracy:", accuracy_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)
print("F1 score:", f1_dt)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the same train/test split as the other models
# and report its scores.

# Create an instance of the Random Forest model.
# random_state is fixed (same seed as the train/test split) because the forest draws
# random bootstrap samples and feature subsets; unseeded, the metrics and the tree
# shown by estimators_[0] would change on every run.
random_forest = RandomForestClassifier(random_state=42)

# Fit the model to the training data
random_forest.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_rf = random_forest.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# Print the evaluation metrics
print("Random Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 score:", f1_rf)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Build a 2-D grid over the first two training features ('Alcohol' and 'Accident'),
# padded by 1 on each side and sampled every 0.1
first_feature = X_train.iloc[:, 0]
second_feature = X_train.iloc[:, 1]
x1_min, x1_max = first_feature.min() - 1, first_feature.max() + 1
x2_min, x2_max = second_feature.min() - 1, second_feature.max() + 1
x1_axis = np.arange(x1_min, x1_max, 0.1)
x2_axis = np.arange(x2_min, x2_max, 0.1)
xx1, xx2 = np.meshgrid(x1_axis, x2_axis)

# Hold the third feature constant at its training mean across the whole grid
gender_mean = X_train['Gender'].mean()
xx3 = np.full_like(xx1, gender_mean)

In [None]:
# Flatten the three grids and stack them as columns: one row per grid point,
# columns in the same order as the model's features
flattened_grids = (xx1.ravel(), xx2.ravel(), xx3.ravel())
X_meshgrid = np.column_stack(flattened_grids)

In [None]:
# Predict the class at every grid point with the fitted logistic-regression model.
# The model was fitted on a DataFrame, so the grid is wrapped in a DataFrame carrying
# the training column names; passing the bare array makes scikit-learn warn that
# X does not have valid feature names.
grid_points = pd.DataFrame(X_meshgrid, columns=X_train.columns)
Z = logreg.predict(grid_points)
# Reshape the flat predictions back to the 2-D grid layout expected by contourf
Z = Z.reshape(xx1.shape)

In [None]:
# Shade the predicted class regions and overlay the training points coloured by their label
alcohol_points = X_train.iloc[:, 0]
accident_points = X_train.iloc[:, 1]
plt.contourf(xx1, xx2, Z, alpha=0.3)
plt.scatter(alcohol_points, accident_points, c=y_train, edgecolors='k')
boundary_axes = plt.gca()
boundary_axes.set(
    xlabel='Alcohol',
    ylabel='Accident',
    title='Logistic Regression Decision Boundaries',
)
plt.show()

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the fitted decision tree, with nodes labelled by feature name and filled with colour
tree_figure, tree_axes = plt.subplots(figsize=(10, 6))
plot_tree(decision_tree, feature_names=features, filled=True, ax=tree_axes)
tree_axes.set_title('Decision Tree')
plt.show()

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the first tree of the fitted random forest as an example of its members
first_forest_tree = random_forest.estimators_[0]
forest_figure, forest_axes = plt.subplots(figsize=(10, 6))
plot_tree(first_forest_tree, feature_names=features, filled=True, ax=forest_axes)
forest_axes.set_title('Random Forest - Decision Tree')
plt.show()