<a href="https://colab.research.google.com/github/Diwansu-pilania/streamlit-ml-app/blob/main/trainit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Importing Library

In [None]:
import pandas as pd

###Importing And Encoding the data

In [None]:
# Attempt to read the file with a different encoding

encodings = ["latin1", "ISO-8859-1", "utf-16", "cp1252"]
for enc in encodings:
    try:
        df = pd.read_csv('/content/data.csv', encoding=enc)
        break  # Stop if successful

    except UnicodeDecodeError:
        continue

  df = pd.read_csv('/content/data.csv', encoding=enc)


In [None]:
df

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",4.8,17.4,,,,,1990-02-01
1,151.0,February - M021990,Andhra Pradesh,Hyderabad,,Industrial Area,3.1,7.0,,,,,1990-02-01
2,152.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.2,28.5,,,,,1990-02-01
3,150.0,March - M031990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.3,14.7,,,,,1990-03-01
4,151.0,March - M031990,Andhra Pradesh,Hyderabad,,Industrial Area,4.7,7.5,,,,,1990-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
435737,SAMP,24-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RIRUO,22.0,50.0,143.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-24
435738,SAMP,29-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RIRUO,20.0,46.0,171.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-29
435739,,,andaman-and-nicobar-islands,,,,,,,,,,
435740,,,Lakshadweep,,,,,,,,,,


###Doing neccesary Formating &  Handling the Nan values

In [None]:
# Convert date column to datetime format

df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Extract useful date features

df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Drop unnecessary columns that are mostly missing or not useful for prediction

df = df.drop(columns=['stn_code', 'sampling_date', 'agency', 'location_monitoring_station', 'date'])

# Handle missing values: Fill numeric columns with median and categorical with mode

for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].fillna(df[col].median())


for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
df

Unnamed: 0,state,location,type,so2,no2,rspm,spm,pm2_5,year,month,day
0,Andhra Pradesh,Hyderabad,"Residential, Rural and other Areas",4.8,17.4,90.0,187.0,32.0,1990.0,2.0,1.0
1,Andhra Pradesh,Hyderabad,Industrial Area,3.1,7.0,90.0,187.0,32.0,1990.0,2.0,1.0
2,Andhra Pradesh,Hyderabad,"Residential, Rural and other Areas",6.2,28.5,90.0,187.0,32.0,1990.0,2.0,1.0
3,Andhra Pradesh,Hyderabad,"Residential, Rural and other Areas",6.3,14.7,90.0,187.0,32.0,1990.0,3.0,1.0
4,Andhra Pradesh,Hyderabad,Industrial Area,4.7,7.5,90.0,187.0,32.0,1990.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
435737,West Bengal,ULUBERIA,RIRUO,22.0,50.0,143.0,187.0,32.0,2015.0,12.0,24.0
435738,West Bengal,ULUBERIA,RIRUO,20.0,46.0,171.0,187.0,32.0,2015.0,12.0,29.0
435739,andaman-and-nicobar-islands,Guwahati,"Residential, Rural and other Areas",8.0,22.0,90.0,187.0,32.0,2010.0,6.0,15.0
435740,Lakshadweep,Guwahati,"Residential, Rural and other Areas",8.0,22.0,90.0,187.0,32.0,2010.0,6.0,15.0


###Convert categorical variables into numerical format using label encoding


In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = encoder.fit_transform(df[col])

In [None]:
df

Unnamed: 0,state,location,type,so2,no2,rspm,spm,pm2_5,year,month,day
0,0,114,6,4.8,17.4,90.0,187.0,32.0,1990.0,2.0,1.0
1,0,114,1,3.1,7.0,90.0,187.0,32.0,1990.0,2.0,1.0
2,0,114,6,6.2,28.5,90.0,187.0,32.0,1990.0,2.0,1.0
3,0,114,6,6.3,14.7,90.0,187.0,32.0,1990.0,3.0,1.0
4,0,114,1,4.7,7.5,90.0,187.0,32.0,1990.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
435737,35,282,3,22.0,50.0,143.0,187.0,32.0,2015.0,12.0,24.0
435738,35,282,3,20.0,46.0,171.0,187.0,32.0,2015.0,12.0,29.0
435739,36,100,6,8.0,22.0,90.0,187.0,32.0,2010.0,6.0,15.0
435740,17,100,6,8.0,22.0,90.0,187.0,32.0,2010.0,6.0,15.0


###Handling the outliners

In [None]:
import numpy as np

# Convert NO₂ column to numeric (in case of any string issues)
df['no2'] = pd.to_numeric(df['no2'], errors='coerce')

# Compute IQR for NO₂
Q1 = df['no2'].quantile(0.25)
Q3 = df['no2'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_cleaned = df[(df['no2'] >= lower_bound) & (df['no2'] <= upper_bound)]

# Display the number of removed outliers and remaining records
outliers_removed = df.shape[0] - df_cleaned.shape[0]
df_cleaned.shape[0], outliers_removed


(416409, 19333)

In [None]:

df_cleaned

Unnamed: 0,state,location,type,so2,no2,rspm,spm,pm2_5,year,month,day
0,0,114,6,4.8,17.4,90.0,187.0,32.0,1990.0,2.0,1.0
1,0,114,1,3.1,7.0,90.0,187.0,32.0,1990.0,2.0,1.0
2,0,114,6,6.2,28.5,90.0,187.0,32.0,1990.0,2.0,1.0
3,0,114,6,6.3,14.7,90.0,187.0,32.0,1990.0,3.0,1.0
4,0,114,1,4.7,7.5,90.0,187.0,32.0,1990.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
435737,35,282,3,22.0,50.0,143.0,187.0,32.0,2015.0,12.0,24.0
435738,35,282,3,20.0,46.0,171.0,187.0,32.0,2015.0,12.0,29.0
435739,36,100,6,8.0,22.0,90.0,187.0,32.0,2010.0,6.0,15.0
435740,17,100,6,8.0,22.0,90.0,187.0,32.0,2010.0,6.0,15.0


###Creating and Evaluatng the Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define features (X) and target (y)
X = df_cleaned.drop(columns=['no2'])
y = df_cleaned['no2']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2


(3.1067335249245795, 0.8200601827853435)

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([()])

In [None]:
import joblib
joblib.dump(model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [None]:
# Load the model
loaded_model = joblib.load('random_forest_model.pkl')  # Load the model from the file

In [None]:
!pip install anvil-uplink

Collecting anvil-uplink
  Downloading anvil_uplink-0.5.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting argparse (from anvil-uplink)
  Downloading argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting ws4py-sslupdate (from anvil-uplink)
  Downloading ws4py_sslupdate-0.5.1b0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading anvil_uplink-0.5.2-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Downloading ws4py_sslupdate-0.5.1b0-py2.py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ws4py-sslupdate, argparse, anvil-uplink
Successfully installed anvil-uplink-0.5.2 argparse-1.4.0 ws4py-sslupdate-0.5.1b0


In [None]:
import anvil.server

anvil.server.connect('server_L7EIRYI3JO3QT7YJVOWLDZJ4-RJCW2F56LQQJR3O5')

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default Environment" as SERVER


In [None]:
@anvil.server.callable
def predict_no2(stn_code,sampling_date, state, location, agency, type, so2, rspm, spm, location_monitoring_station, pm2_5, date):
    # Prepare the input data for prediction
    input_data = np.array([[stn_code,sampling_date, state, location, agency, type , so2, rspm, spm, location_monitoring_station, pm2_5, date]])

    # Make the prediction
    prediction = model.predict(input_data)

    # Return the predicted NO2 level
    return prediction[0]  # Assuming the model returns an array

In [None]:
anvil.server.wait_forever()



KeyboardInterrupt: 

In [None]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
from google.colab import files
files.download('model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>