In [None]:
import pandas as pd

# Load the data.
# The CSV location can be overridden with the OPP_STOPS_CSV environment
# variable so the notebook is not tied to one machine's directory layout;
# the original absolute path is kept as the fallback.
import os

DATA_PATH = os.environ.get(
    'OPP_STOPS_CSV',
    '/home/brandon/IU International University/Project: From Model to Production/opp-stops_state.csv',
)
data = pd.read_csv(DATA_PATH)

# Display the first few rows of the dataframe
data.head()


In [None]:
# Per-column overview: describe() statistics laid out with one row per
# column, plus the number of missing entries in each column.
missing_per_column = data.isna().sum()
summary = data.describe(include='all').T
summary['missing_values'] = missing_per_column
summary


In [None]:
# Descriptive statistics restricted to the numerical rate/count columns
rate_and_count_cols = [
    'search_rate', 'stop_rate', 'hit_rate',
    'inferred_threshold', 'stops_per_year', 'stop_rate_n',
]
data[rate_and_count_cols].describe()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for the figures
sns.set_style("whitegrid")

# Three stacked panels, one per view of the data by race
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
ax_counts, ax_stop, ax_search = axes

# Panel 1: countplot tallies the dataset rows in each race category
sns.countplot(data=data, x='subject_race', ax=ax_counts)
ax_counts.set_title('Distribution of Stops by Race')

# Panels 2 and 3: spread of each rate within every race category
for panel, rate_col, panel_title in (
    (ax_stop, 'stop_rate', 'Stop Rate by Race'),
    (ax_search, 'search_rate', 'Search Rate by Race'),
):
    sns.boxplot(data=data, x='subject_race', y=rate_col, ax=panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Per-state aggregates, each ordered from largest to smallest
state_groups = data.groupby('state')
total_stops_per_state = state_groups['stops_per_year'].sum().sort_values(ascending=False)
avg_stop_rate_per_state = state_groups['stop_rate'].mean().sort_values(ascending=False)
avg_search_rate_per_state = state_groups['search_rate'].mean().sort_values(ascending=False)

# One bar chart per aggregate, stacked vertically: (series, title, y-axis label)
state_panels = (
    (total_stops_per_state, 'Total Stops per State', 'Total Stops per Year'),
    (avg_stop_rate_per_state, 'Average Stop Rate per State', 'Average Stop Rate'),
    (avg_search_rate_per_state, 'Average Search Rate per State', 'Average Search Rate'),
)

fig, axes = plt.subplots(3, 1, figsize=(15, 20))
for panel, (per_state, panel_title, y_label) in zip(axes, state_panels):
    sns.barplot(ax=panel, x=per_state.index, y=per_state.values)
    panel.set_title(panel_title)
    panel.set_xlabel('State')
    panel.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# Stop rate against search rate, one point per row of the dataset
plt.figure(figsize=(10, 6))
stop_search_ax = sns.scatterplot(data=data, x='stop_rate', y='search_rate')
stop_search_ax.set_title('Stop Rate vs Search Rate')
stop_search_ax.set_xlabel('Stop Rate')
stop_search_ax.set_ylabel('Search Rate')
plt.show()

# Pearson correlation matrix of the two rates (DataFrame.corr defaults to Pearson)
data.loc[:, ['stop_rate', 'search_rate']].corr()


In [None]:
# Spread of the hit rate within each race category
plt.figure(figsize=(10, 6))
hit_rate_ax = sns.boxplot(data=data, x='subject_race', y='hit_rate')
hit_rate_ax.set_title('Hit Rate by Race')
hit_rate_ax.set_xlabel('Race')
hit_rate_ax.set_ylabel('Hit Rate')
plt.show()


In [None]:
# A county name is not guaranteed to be unique across states, so a county is
# identified by its (state, subgeography) pair; grouping on the name alone
# would pool different counties that happen to share a name.
county_key = ['state', 'subgeography']
per_county = data.groupby(county_key)

# Top 10 counties with the highest number of stops
top_stops_counties = per_county['stops_per_year'].sum().nlargest(10)

# Top 10 counties with the highest stop rate
top_stoprate_counties = per_county['stop_rate'].mean().nlargest(10)

# Top 10 counties with the highest search rate
top_searchrate_counties = per_county['search_rate'].mean().nlargest(10)

# (state, subgeography) label of every row, used to pick out the rows that
# belong to one of the top-10 counties
row_county = pd.MultiIndex.from_frame(data[county_key])


def _mean_hit_rate(top_counties):
    """Mean hit_rate over the rows whose (state, subgeography) is in top_counties' index."""
    in_top = row_county.isin(top_counties.index)
    return data.loc[in_top, 'hit_rate'].mean()


# Average hit rate for these counties
avg_hitrate_top_stops = _mean_hit_rate(top_stops_counties)
avg_hitrate_top_stoprate = _mean_hit_rate(top_stoprate_counties)
avg_hitrate_top_searchrate = _mean_hit_rate(top_searchrate_counties)

(top_stops_counties, top_stoprate_counties, top_searchrate_counties, avg_hitrate_top_stops, avg_hitrate_top_stoprate, avg_hitrate_top_searchrate)


In [None]:
# Scatter plot of stop rate vs hit rate
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='stop_rate', y='hit_rate')
plt.title('Stop Rate vs Hit Rate')
plt.xlabel('Stop Rate')
plt.ylabel('Hit Rate')
plt.show()

# Calculate the Pearson correlation coefficient
stop_hit_corr = data[['stop_rate', 'hit_rate']].corr()

# Scatter plot of search rate vs hit rate
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='search_rate', y='hit_rate')
plt.title('Search Rate vs Hit Rate')
plt.xlabel('Search Rate')
plt.ylabel('Hit Rate')
plt.show()

# Calculate the Pearson correlation coefficient
search_hit_corr = data[['search_rate', 'hit_rate']].corr()

# Show both coefficients as the cell output (the off-diagonal entry of each
# 2x2 correlation matrix)
pd.Series(
    {
        'stop_rate vs hit_rate': stop_hit_corr.loc['stop_rate', 'hit_rate'],
        'search_rate vs hit_rate': search_hit_corr.loc['search_rate', 'hit_rate'],
    },
    name='pearson_r',
)


<h1>Key Findings</h1>

***States and Counties:***

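The dataset covers 21 unique states and 1062 unique subgeographies (counties).
The most frequent county name is Jefferson County; because the same county name can occur in more than one state, this count may combine several distinct counties.

***Missing Values:***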

The fields search_rate, hit_rate, and inferred_threshold have many missing values, which may limit the insights that can be derived from these fields.


***Stops by Race:***

The most stops were made for individuals identified as 'white', followed by 'hispanic' and 'black'.


***Stop Rate by Race:***

The median stop rate appears to be highest for 'black', followed by 'hispanic' and 'white'.

***Search Rate by Race:***

The median search rate appears to be slightly higher for 'black' and 'hispanic' compared to 'white'.

***Stops by State:***

Some states have a significantly higher total number of stops per year than others.
The state with the highest total number of stops per year appears to be California (CA), followed by Texas (TX) and Florida (FL).


***Rate Variations across States:***

The stop rate and search rate vary considerably across states.
The states with the highest average stop rate and search rate are not necessarily the ones with the highest total number of stops.

***Correlation between Rates:***

There is no strong linear relationship between the stop rate and the search rate or between the stop rate and the hit rate.
The same is true for the search rate and the hit rate.

***Stops by County:***

The counties with the highest total number of stops per year include Los Angeles County, San Diego County, Orange County, San Bernardino County, and Alameda County, among others.
The average hit rate for these counties is approximately 0.25.

***Rate Variations across Counties:***

The counties with the highest average stop rate and search rate do not necessarily align with the counties with the highest total number of stops.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Target column for this split. It is removed from X below, so it must not be
# listed among the feature columns: ColumnTransformer selects columns by name
# and fitting it on X would fail on a column that is no longer there.
target_col = 'subject_race'

# Define numerical and categorical feature columns (target excluded)
numerical_cols = ['search_rate', 'stop_rate', 'hit_rate', 'inferred_threshold', 'stops_per_year', 'stop_rate_n']
categorical_cols = ['state', 'subgeography']

# Preprocessing for numerical columns (fill missing values with median)
num_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical columns (fill missing values with most frequent value and then one-hot encode)
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

# Define your target variable and features
y = data[target_col]
X = data.drop(columns=target_col)

# Split the data into train and test sets (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)






In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Feature columns, grouped by how they are preprocessed
numerical_cols = ['search_rate', 'hit_rate', 'inferred_threshold', 'stops_per_year']
categorical_cols = ['subject_race', 'state', 'subgeography']

# Numerical features: missing values replaced by the column median
num_transformer = SimpleImputer(strategy='median')

# Categorical features: missing values replaced by the most frequent value,
# then one-hot encoded; categories not seen during fit are ignored at predict time
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
cat_transformer = Pipeline(steps=[
    ('imputer', most_frequent_imputer),
    ('onehot', one_hot_encoder),
])

# Route each column group to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

# Regression target is stop_rate; every other column stays in X
y = data['stop_rate']
X = data.drop(columns='stop_rate')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest chained behind the preprocessing step
model = RandomForestRegressor(n_estimators=100, random_state=42)
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit preprocessing and model together on the training split
my_pipeline.fit(X_train, y_train)

# Predictions for the held-out split
preds = my_pipeline.predict(X_test)

# Pipeline.score delegates to the regressor's score, reported here as R^2
score = my_pipeline.score(X_test, y_test)
print(f'Random Forest Regressor R^2 score: {score}')


In [None]:
import os
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Point MLflow at the tracking server. The address can be overridden with the
# MLFLOW_TRACKING_URI environment variable; the local server used so far
# remains the default.
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', 'http://127.0.0.1:5000'))

# Start a new MLflow run
with mlflow.start_run():
    # Fit inside the run so the logged model and metrics come from the same training pass
    my_pipeline.fit(X_train, y_train)

    # Predictions for the held-out split
    preds = my_pipeline.predict(X_test)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    # Log the fitted pipeline (preprocessing + model) as the run's model artifact
    mlflow.sklearn.log_model(my_pipeline, "model")

    # Log all metrics in one call
    mlflow.log_metrics({"rmse": rmse, "r2": r2, "mae": mae})

    # Log parameter
    mlflow.log_param("model_type", "Random Forest Regressor")

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")



In [None]:
from prefect import Flow, task, context, logging, flow
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

@task
def load_data():
    """Bundle the notebook's train/test split into a single dict.

    Nothing is read from disk here: X_train, X_test, y_train and y_test are
    taken from the notebook namespace and returned under those same keys.
    """
    split_names = ('X_train', 'X_test', 'y_train', 'y_test')
    split_values = (X_train, X_test, y_train, y_test)
    return dict(zip(split_names, split_values))

@task
def train_model(data):
    """Fit the notebook-level ``my_pipeline`` on the training split and return it.

    ``data`` is the dict of splits; only its 'X_train' and 'y_train' entries
    are read. The pipeline object is fitted in place.
    """
    features, target = data['X_train'], data['y_train']
    my_pipeline.fit(features, target)
    return my_pipeline

@task
def make_predictions(model, data):
    """Return ``model``'s predictions for the 'X_test' entry of ``data``."""
    return model.predict(data['X_test'])

@task
def calculate_metrics(data, preds):
    """Score ``preds`` against the 'y_test' entry of ``data``.

    Returns a dict with keys 'rmse' (square root of the mean squared error),
    'mae' and 'r2'.
    """
    y_true = data['y_test']
    return {
        'rmse': np.sqrt(mean_squared_error(y_true, preds)),
        'mae': mean_absolute_error(y_true, preds),
        'r2': r2_score(y_true, preds),
    }

@task
def log_metrics(metrics):
    """Write the 'rmse', 'mae' and 'r2' entries of ``metrics`` to the run logger at INFO level."""
    run_logger = logging.get_run_logger()
    # (label shown in the log line, key in the metrics dict), in output order
    for label, key in (('RMSE', 'rmse'), ('MAE', 'mae'), ('R2', 'r2')):
        run_logger.info(f"{label}: {metrics[key]}")
    
@flow(name="End-to-end Flow")
def run_flow_fn():
    """Chain the tasks: gather the splits, fit, predict, score, then log the scores."""
    splits = load_data()
    fitted_pipeline = train_model(splits)
    predictions = make_predictions(fitted_pipeline, splits)
    scores = calculate_metrics(splits, predictions)
    log_metrics(scores)

# Call the function to run the flow
run_flow_fn()




In [None]:
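# Important local addresses
# Prometheus: localhost:9090  (remember to cd into the Prometheus folder before starting it)
# MLflow: localhost:5001  (NOTE: the MLflow tracking cell in this notebook points at http://127.0.0.1:5000 - confirm which port the server actually uses)
# Grafana: localhost:3000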

#start the Grafana server
#sudo service grafana-server start

#start the Prometheus server
#./prometheus --config.file=prometheus.yaml

#Starts the Prefect Server
#prefect server start


#Gradio IP
#http://127.0.0.1:7861/

In [None]:
#Create the Gradio App
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Define numerical and categorical columns
numerical_cols = ['search_rate', 'hit_rate', 'inferred_threshold', 'stops_per_year']
categorical_cols = ['subject_race', 'state', 'subgeography']

# Preprocessing for numerical columns (fill missing values with median)
num_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical columns (fill missing values with most frequent value and then one-hot encode)
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

# Define the target variable and features
y = data['stop_rate']
X = data.drop('stop_rate', axis=1)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)


def predict(search_rate, hit_rate, inferred_threshold, stops_per_year, subject_race, state, subgeography):
    """Predict stop_rate for one hand-entered record.

    The arguments arrive from the Gradio inputs in the order they are declared
    below (numerical columns first, then categorical columns). They are placed
    in a one-row DataFrame whose column names are the ones the pipeline's
    preprocessor selects, and the single prediction is returned as a float.
    """
    record = pd.DataFrame({
        'search_rate': [search_rate],
        'hit_rate': [hit_rate],
        'inferred_threshold': [inferred_threshold],
        'stops_per_year': [stops_per_year],
        'subject_race': [subject_race],
        'state': [state],
        'subgeography': [subgeography]
    })
    preds = my_pipeline.predict(record)
    return float(preds[0])


def _dropdown_choices(column):
    """Distinct non-missing values of ``column`` in ``data`` as a plain list."""
    return data[column].dropna().unique().tolist()


# Define the Gradio interface.
# Columns are looked up by their string names (the bare names subject_race /
# state / subgeography are not variables in this notebook). Components are
# created through the top-level gr.Number / gr.Dropdown classes rather than
# the deprecated gr.inputs namespace. Input order must match predict()'s
# parameter order: numerical_cols followed by categorical_cols.
iface = gr.Interface(
    fn=predict,
    inputs=(
        [gr.Number(label=col) for col in numerical_cols]
        + [gr.Dropdown(choices=_dropdown_choices(col), label=col) for col in categorical_cols]
    ),
    outputs='number',
)

# Launch the Gradio interface locally; share=False creates no public link
# (set share=True to request a temporary public URL).
iface.launch(share=False)

