Reading Data

In [1]:
from pathlib import Path
import pandas as pd

BASE_DIR = Path.cwd().parent.parent
DATA_PATH = BASE_DIR / "media" / "data.xlsx"
df = pd.read_excel(DATA_PATH)
df

Unnamed: 0,year,month,day,hour,generation,temp,feelslike,dew,humidity,precip,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,2022,6,22,0,0.0,12.4,12.4,9.2,80.83,0.0,...,1007.9,91.1,10.0,0,0.0,0,10,Overcast,cloudy,33177099999.3339
1,2022,6,22,1,0.0,12.5,12.5,9.0,79.23,0.0,...,1007.0,95.7,24.1,0,0.0,0,10,Overcast,cloudy,remote
2,2022,6,22,2,0.0,12.7,12.7,9.0,78.19,0.0,...,1007.0,95.3,24.1,0,0.0,0,10,Overcast,cloudy,remote
3,2022,6,22,3,0.0,10.9,10.9,8.1,82.85,0.0,...,1009.8,80.0,10.0,0,0.0,0,10,Partially cloudy,partly-cloudy-night,33393099999
4,2022,6,22,4,0.0,12.2,12.2,8.8,79.72,0.0,...,1007.0,97.1,20.8,0,0.0,0,10,Overcast,cloudy,remote
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13052,2024,2,23,19,0.0,9.6,7.4,3.8,67.13,0.0,...,1007.0,98.4,24.1,0,0.0,0,10,Overcast,cloudy,
13053,2024,2,23,20,0.0,9.1,6.7,3.7,68.94,0.0,...,1008.0,94.0,24.1,0,0.0,0,10,Overcast,cloudy,
13054,2024,2,23,21,0.0,8.6,6.3,3.4,69.81,0.0,...,1008.0,78.2,24.1,0,0.0,0,10,Partially cloudy,partly-cloudy-night,
13055,2024,2,23,22,0.0,7.9,5.7,3.3,72.70,0.0,...,1009.0,87.1,24.1,0,0.0,0,10,Partially cloudy,partly-cloudy-night,


Checking null values for each column


In [2]:
print(df.isnull().sum())
print(df.shape) #13057 rows present

year                    0
month                   0
day                     0
hour                    0
generation              3
temp                    0
feelslike               0
dew                     0
humidity                0
precip                  0
precipprob              0
preciptype          10375
snow                    0
snowdepth               0
windgust                0
windspeed               0
winddir                 0
sealevelpressure        0
cloudcover              0
visibility              0
solarradiation          0
solarenergy             0
uvindex                 0
severerisk              0
conditions              0
icon                    0
stations              569
dtype: int64
(13057, 27)


 Null values are present in preciptype,stations and generation columns.
 Drop null rows that have no station value.
 Fill null values in preciptype with "None".


In [3]:
df.fillna({"preciptype": "None"}, inplace=True)
df.dropna(inplace=True)

Recheck null values

In [4]:

print(df.isnull().sum())                           # No null values present
print(df.shape)                                    # currently 12485 rows present

year                0
month               0
day                 0
hour                0
generation          0
temp                0
feelslike           0
dew                 0
humidity            0
precip              0
precipprob          0
preciptype          0
snow                0
snowdepth           0
windgust            0
windspeed           0
winddir             0
sealevelpressure    0
cloudcover          0
visibility          0
solarradiation      0
solarenergy         0
uvindex             0
severerisk          0
conditions          0
icon                0
stations            0
dtype: int64
(12485, 27)


 Considering maximum power of each station as system size


In [5]:
system_size = df.groupby("stations")["generation"].max()
df["system size"]=df["stations"].map(system_size)

 Converting negative generation values to zero


In [6]:
df["generation"] = df["generation"].clip(lower=0)


Using normalized power=power/system size


In [7]:
df["normalized power"]=df["generation"]/df["system size"]


Exporting processed data to csv file


In [8]:
# df.to_csv(BASE_DIR / "media" / "ProcessedData.csv", index=False)
# final output:predict normalized power using weather data and multiply it with user's system size

Importing Libraries


In [9]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Plotting Year v/s available Data


In [10]:
import plotly.io as pio
pio.renderers.default = "vscode"


In [11]:
year_counts = (
    df["year"]
    .value_counts()
    .sort_index()
    .reset_index()
)
# Bar plot
fig = px.bar(
    year_counts,
    x="year",
    y="count",
    title="Year vs Number of Data Points",
    labels={"year": "Year", "count": "Number of Records"},
)
fig.show()

Splitting Data

In [12]:
train_df=df[((df['year']==2022) | ((df['year']==2023) & (df['month']<=7)))]
val_df=df[(df["year"]==2023) & ((df["month"]>7) & (df["month"]<=8))]
test_df=df[((df["year"]==2023) & (df["month"]>=9))]
print(df.shape)
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)


(12485, 29)
(9717, 29)
(744, 29)
(1993, 29)


Plotting Relations

In [13]:
train_df.describe()

Unnamed: 0,year,month,day,hour,generation,temp,feelslike,dew,humidity,precip,...,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,system size,normalized power
count,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,...,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0,9717.0
mean,2022.523207,6.551713,15.973449,11.496964,135.249357,10.70459,9.484512,6.207996,76.842535,0.097096,...,193.829093,1016.927776,73.900051,19.625399,143.348359,0.515457,1.424617,10.042812,859.890604,0.157502
std,0.499487,3.277216,8.84944,6.921449,223.531211,8.722086,10.036863,6.956699,18.043549,0.850386,...,101.964522,8.044374,32.279159,7.794165,230.500514,0.830623,2.326615,2.611849,10.426707,0.260421
min,2022.0,1.0,1.0,0.0,0.0,-13.9,-18.7,-15.1,22.17,0.0,...,0.0,989.0,0.0,0.0,0.0,0.0,0.0,3.0,772.0,0.0
25%,2022.0,4.0,8.0,5.0,0.0,2.9,0.6,0.4,65.71,0.0,...,101.0,1012.0,55.6,15.5,0.0,0.0,0.0,10.0,852.0,0.0
50%,2023.0,7.0,16.0,11.0,5.0,10.8,10.8,6.3,82.84,0.0,...,206.5,1017.0,90.0,24.1,7.0,0.0,0.0,10.0,864.0,0.005787
75%,2023.0,9.0,24.0,17.0,171.0,17.4,17.4,11.9,90.97,0.0,...,283.0,1021.6,100.0,24.1,206.0,0.7,2.0,10.0,864.0,0.201389
max,2023.0,12.0,31.0,23.0,864.0,33.9,33.6,21.5,100.0,28.545,...,360.0,1048.5,100.0,50.0,928.0,3.3,9.0,75.0,864.0,1.0


"Explaination for each column":

(1)generation(KWH): target column, defines actual solar power production during an entire hour

(2)feelslike temperature(celcius):probably useless cause we already have real temperature column and feels like temp is just for humans(machines don't feel)

(3)dew(celcius):Denotes the temperature at which air turns into dew. Higher the dew point is the more moisture air contains and dew moisture in air the sunlight is more scattered.
Calculated from:
                Temperature
                Relative humidity
                Pressure.
Effect on solar panels:
                High dew → condensation on panels (early morning)
                Can slightly reduce early irradiance

(4)humidity (relative humidity %):Denotes % of water vapor present compared to maximum possible at that temperature.
Effect on solar:
                High humidity → more scattering of sunlight
                Slight efficiency reduction

(5)precip (mm):Any water falling from sky(if its snow, it will be converted to its equivalent measure in liquid form).
Inshort it represents Depth of water accumulated on flat ground.
                1 mm precip = 1 liter per m²
Effect on solar:
                Clouds reduce radiation
                Rain can clean panels afterward

(6)precipprob (precipitation probability %):Chance of precipitation occurring in that time window
                Weak predictor

(7)preciptype:preciptype has these 5 values:None,rain,freezingrain,snow ,(rain,snow)
Why this matters:
                Snow blocks panels 
                Rain reduces radiation temporarily
                Freezing rain can be damaging
                could be onehotencoded-->Might prove really useful

(8)snow (mm):Fresh snowfall during the period.

(9)snowdepth(mm):Total snow already on ground.
                Snow depth > 0 → panels blocked → near-zero generation

(10)windgust(km/h):maximum short burst speed during interval
                Might not prove to be major contributor to the generation

(11)windspeed(km/h):average sustained wind
                Very important factor
            
(12)winddir (degrees):direction from which wind comes
                        0° / 360° → North
                        90° → East
                        180° → South
                        270° → West

(13)sealevelpressure(hpa):ressure of air normalized to sea level
                            1hpa=100pa
                            normalized sea level pressure is 1013hpa
    Why it matters:
                            Low pressure → clouds, storms
                            High pressure → clear skies ☀️

(14)cloudcover (%):fraction of sky covered by clouds
                    0% → clear sky
                    100% → overcast
    Effect on solar:
                    One of the strongest predictors
                    Directly blocks irradiance

(15)visibility (km):maximum distance you can see clearly
Low visibility means:
                    Fog
                    Haze
                    Heavy rain/snow
                    Low visibility → high scattering → low radiation

(16)solarradiation (W/m²):instantaneous power from sun per unit area. Also known as GHI.
                                            MOST IMPORTANT FEATURE

(17)solarenergy(kWh):integrated solar radiation over time
                        solarenergy ≈ solarradiation × time

(18)uvindex:strength of ultraviolet radiation.
Calculated using:
                UV irradiance
                Weighted by biological effect on skin

(19)conditions:Human-readable weather summary
                Probably drop cause we already have preciptype and snow columns

(20)icon:UI symbol mapping. Zero scientific value.
                Drop

(21)stations:The station code where the energy is generated.
                Used to groupby data station wise and find maximum output of each station so that it can be considered asa system size.
            

Solar Radiation(W/m^2) vs Solar Energy Generation(kWh):

In [14]:
fig=px.scatter(train_df,x="solarradiation",y="generation",color="preciptype",
               title="Solar Radiation(W/m^2) vs Solar Energy Generation(kWh)",
               labels={
        "solarradiation": "Solar Radiation (W/m^2)",
        "generation": "Generation(kWh)",
    })
fig.show()
# In following graph:
# (1)i can observe that in most of the cases when there is preciptype not None the solar radiation is low.
# (2)When there is preciptype=None there is a little linear correlation present between solarradiation and generation,
# although data is scattered alot so i think linear regression would not be so appropriate to use ,might go for RandforestRegressor.
# But i should definetly give linear regression a try.
# (3)majority of points lie near the origin . Probably cause most of the time solar generation is zero due to night time.
# (4)from graph i can analyse that maximum solar generation is when preciptype=None
# After that order is like this(Note:These are just my assumption based on the look at the graph):
#                               preciptype=rain(linearity is broken)
#                               preciptype=snow(linearity is broken)
#                               preciptype=rain,snow(linearity is broken)
#                               preciptype=freezingrain(linearity is broken)




In [15]:
train_df["solarradiation"].corr(train_df["generation"])
# correlation coffecient isn't bad. Linear regression may work


np.float64(0.7704274356932143)

Solar Energy(kWh) vs Generation(kWh):

In [16]:
fig=px.scatter(train_df,x="solarenergy",y="generation",color="preciptype",title="Solar Energy(kWh) vs Generation(kWh)",
               labels={
        "solarenergy": "Solar Energy(kWh)",
        "generation": "Generation(kWh)",
    })
fig.show()
# In following graph:
# following graph is almost unreadable cause of overlapping of points
# i may not be able to conclude anything from this.




In [17]:
train_df['solarenergy'].corr(train_df["generation"])
# correlation value doesn't look bad(graphically it is awful to judge:i guess most of the 
# point lies in linear tred overlapping each other )-->linear regression maybe useful

np.float64(0.77027879839433)

Temperature(°C) vs Generation(kWh)

In [18]:
fig=px.scatter(train_df,x="temp",y="generation",color="preciptype",title="Temperature(°C) vs Generation(kWh)",
               labels={
        "temp": "Temperature(°C)",
        "generation": "Generation(kWh)",
    } )
fig.show()
# Observations:
# (1)Temperature is generally (0-20 degree celcius)-->Model would probably work better on europeon countries rather asia.
# (2)Graphically i can see exponential increase(not complete exponential but a little) in solar generation with increase in temperature.
# but since most of the points lie at the botton(cause generation at night is 0)  that is why 
# the avg value would be around at vertical center which will result in some short of linear relation

In [19]:
train_df["temp"].corr(train_df["generation"])
# correlation value is not bad.
# not as good as solarradiation and solarenergy still it is good enough to try a linear model.

np.float64(0.47530724691759096)

Humidity(%) vs Generation(kWh):

In [20]:
# from first graph of the following i can state that generation is usually low when humidity is very high.
# humidity is high when there is rain or snow.
# we can conclude that humidity does affect generation,the reason for that could be that humidity corelates to 
# rain or other reason might be that when there is too much moisture in the air the solar radiation is scattered
# cause of that light concerntration on solar panel is reduced
fig=px.scatter(train_df,x="humidity",y="generation",color="preciptype",title="Humidity(%) vs Generation(kWh)",
    labels={
        "generation": "Generation(kWh)",
        "humidity": "Humidity(%)"
    })
fig.show()
# from the second graph of following we can clearly state that when temperature is 
# low and humidity is high generation is most of the time zero and this is very important conclusion.
fig = px.scatter(
    train_df,
    x="temp",
    y="humidity",
    color="generation",
    color_continuous_scale="Viridis",
    opacity=0.6,
    labels={
        "temp": "Temperature(°C)",
        "generation": "Generation(kWh)",
        "humidity": "Humidity(%)"
    },
    title="Temperature(celcius) vs Humidity(%) (with generation)"
)
fig.show()



In [21]:
train_df["humidity"].corr(train_df["generation"])
# not bad at all

np.float64(-0.580917488053572)

Dew Point(°C) vs Generation(kWh):

In [22]:
# I don't think this is gonna be major factor but let's give it a try by plotting
fig=px.scatter(train_df,x="dew",y="generation",title="Dew(°C) vs Generation(kWh)",
    labels={
        "dew": "Dew Point(°C)",
        "generation": "Generation(kWh)",
    })
fig.update_traces(marker=dict(color="red"))
fig.show()
# not much to observe,but i can say when dew point is in negative(hence pressure must be low),the generation is low.


Box Plot For Preciptype vs Generation(kWh)

In [23]:
train_df["dew"].corr(train_df["generation"])
# weak linar relation but not too weak

np.float64(0.21324715617913387)

In [24]:
px.box(
    train_df,
    x="preciptype",
    y="generation",
    title="Generation under Different Precipitation Types"
)
# from following graph i can that i was right when i had assumed the order of max solar generation according to
#  preciptype.(None>rain>snow>snow,rain,freezingrain)
# i can see in the following data median for rain and freezingrain is higher than None,and yes that is possible,when
#  preciptype might have been at that time there might have been more nights(generation=0),and when preciptype might have been 
# either rain or freezing rain there might have been less number of nights hence less number of zero power generation. It 
# really doesn't affect our model.


Wind Speed(Km/h) vs Generation(kWh)

In [25]:
fig=px.scatter(train_df,x="windspeed",y="generation",title="Wind Speed(km/h) vs Generation(kWh)",
    labels={
        "windspeed": "Wind Speed(km/h)",
        "generation": "Generation(kWh)",
    })
fig.update_traces(marker=dict(color="green"))
fig.show()
# We conclude that there is not much relation between wind speed and generation in this dataset.
# Although we can see that when wind speed is high generation is low the reason might be (or probably cause of night).

In [26]:
train_df["windspeed"].corr(train_df["generation"])
# disappointing,i thought wind speed will be a good measure for linear regression in producing solar power before studying the graph but it 
# turns out it is not that significant.

np.float64(0.2222220357275538)

Sealevelpressure(hpa) vs Generation(kWh):

In [27]:
fig=px.scatter(train_df,x="sealevelpressure",y="generation",title="Sealevelpressure(hpa) vs Generation(kWh)",
    labels={
        "sealevelpressure": "Sealevelpressure(hpa)",
        "generation": "Generation(kWh)",
    })
fig.show()
# From the following graph i can observe that when sealevelpressure is normal(1013hpa) or slightly higher(close-->till 1028hpa),
# the solar generation is really high but when sealevelpressure is lower than normal value or significantly higher than normal value
# then solar generation is really low. The reason for that can be interpreted as:
# (1)when sealevelpressure is lower than normal then it indicates storm or other severe weather  conditions and cause of 
# that solar output is low
# (2)when sealevelpressure is high it should generally not produce low output cause high pressure generally meanas clear sky.
# *But Not Always*
# Strong high-pressure systems often cause:
#       Subsidence inversion (warm air above cool air)
#           Trapping of:
#                   haze
#                   dust
#                   pollution
#                   moisture near surface
# This leads to:
        # Smog / haze
        # Thin but widespread cloud layers
        # Reduced solar irradiance


In [28]:
train_df["sealevelpressure"].corr(train_df["generation"])
# very very weak-->not at all linear but can be really useful in randomforest regression

np.float64(-0.038460123139229904)

Cloud Cover(%) vs Generation(kWh):

In [29]:
fig=px.scatter(train_df,x="cloudcover",y="generation",title="Cloud Cover(%) vs Generation(kWh)",
    labels={
        "cloudcover": "Cloud Cover(%)",
        "generation": "Generation(kWh)",
    })
fig.show()
# Well from following graph i can clearly say that cloud cover is weak indicator.
# At first i thought it will be a great indicator,but then i got to understand,as we know data is recorded at night as well so there is
# zero solar generation even when cloud cover is low.
# other reason is that cloud cover is usually estimated, not measured precisely
# Often rounded to fixed buckets (0, 25, 50, 75, 100)(that is why i see vertical line shaped structure at cloud cover=[0,30,40,60,80,100])
# Describes fraction of sky covered, not cloud thickness
# A sky with:
        # 80% thin cirrus clouds
        # can still produce high GHI

# A sky with:
        # 30% thick cumulonimbus
        # can produce near-zero generation
# The relationship between cloud cover and solar generation is highly scattered and non-monotonic. While heavy cloud cover is generally
#  associated with lower generation, moderate to high cloud cover can still coincide with high output due to variations in cloud thickness,
#  cloud type, and increased diffuse radiation. This indicates that cloud cover percentage alone is an insufficient descriptor of cloud 
# impact on solar generation.

In [30]:
train_df["cloudcover"].corr(train_df["generation"])
# very weak as expected from graph

np.float64(-0.08255035893970698)

One Hot Encoding

In [31]:

train_df = train_df.reset_index(drop=True)
val_df   = val_df.reset_index(drop=True)
test_df  = test_df.reset_index(drop=True)


def encode_precip(df, encoder):
    encoded = encoder.transform(df[["preciptype"]])
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(["preciptype"]),
        index=df.index          # 🔑 THIS FIXES NaNs
    )
    return pd.concat([df.drop(columns=["preciptype"]), encoded_df], axis=1)


In [32]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(sparse_output=False,handle_unknown="ignore")
ohe.fit(train_df[["preciptype"]])

train_df = encode_precip(train_df, ohe)
val_df   = encode_precip(val_df, ohe)
test_df  = encode_precip(test_df, ohe)


In [33]:
train_df

Unnamed: 0,year,month,day,hour,generation,temp,feelslike,dew,humidity,precip,...,conditions,icon,stations,system size,normalized power,preciptype_None,preciptype_freezingrain,preciptype_rain,"preciptype_rain,snow",preciptype_snow
0,2022,6,22,0,0.0,12.4,12.4,9.2,80.83,0.000,...,Overcast,cloudy,33177099999.3339,852.0,0.000000,1.0,0.0,0.0,0.0,0.0
1,2022,6,22,1,0.0,12.5,12.5,9.0,79.23,0.000,...,Overcast,cloudy,remote,864.0,0.000000,1.0,0.0,0.0,0.0,0.0
2,2022,6,22,2,0.0,12.7,12.7,9.0,78.19,0.000,...,Overcast,cloudy,remote,864.0,0.000000,1.0,0.0,0.0,0.0,0.0
3,2022,6,22,3,0.0,10.9,10.9,8.1,82.85,0.000,...,Partially cloudy,partly-cloudy-night,33393099999,772.0,0.000000,1.0,0.0,0.0,0.0,0.0
4,2022,6,22,4,0.0,12.2,12.2,8.8,79.72,0.000,...,Overcast,cloudy,remote,864.0,0.000000,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9712,2023,7,31,19,104.0,23.1,23.1,15.5,62.29,0.000,...,Overcast,cloudy,remote,864.0,0.120370,1.0,0.0,0.0,0.0,0.0
9713,2023,7,31,20,48.0,22.3,22.3,15.4,64.97,0.000,...,Overcast,cloudy,remote,864.0,0.055556,1.0,0.0,0.0,0.0,0.0
9714,2023,7,31,21,10.0,20.6,20.6,16.4,77.02,0.222,...,"Rain, Partially cloudy",rain,33177099999.3339,852.0,0.011737,0.0,0.0,1.0,0.0,0.0
9715,2023,7,31,22,0.0,19.5,19.5,16.1,80.72,0.000,...,Partially cloudy,partly-cloudy-night,remote,864.0,0.000000,1.0,0.0,0.0,0.0,0.0


In [34]:
train_df.columns

Index(['year', 'month', 'day', 'hour', 'generation', 'temp', 'feelslike',
       'dew', 'humidity', 'precip', 'precipprob', 'snow', 'snowdepth',
       'windgust', 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover',
       'visibility', 'solarradiation', 'solarenergy', 'uvindex', 'severerisk',
       'conditions', 'icon', 'stations', 'system size', 'normalized power',
       'preciptype_None', 'preciptype_freezingrain', 'preciptype_rain',
       'preciptype_rain,snow', 'preciptype_snow'],
      dtype='object')

Dividing Between Input Columns(only important ones) and Target Columns:

In [35]:
input_cols=["temp","dew","humidity","windspeed","solarradiation","solarenergy","preciptype_None",
            "preciptype_freezingrain","preciptype_rain","preciptype_rain,snow","preciptype_snow",]
target_cols="normalized power"

Simple Function For Training Model(To Reduce Redundancy):

In [36]:
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,r2_score,mean_squared_error

def try_model(model,inputs):
    model.fit(train_df[inputs],train_df[target_cols])
    predictions=model.predict(val_df[inputs])
    print("Root Mean Squared Error:",root_mean_squared_error(val_df[target_cols],predictions))
    print("Mean Squared Error:",mean_squared_error(val_df[target_cols],predictions))
    print("Mean Absolute Error:",mean_absolute_error(val_df[target_cols],predictions))
    print("R2 score:",r2_score(val_df[target_cols],predictions))
    return model

Linear Regression:


In [37]:
from sklearn.linear_model import LinearRegression

#Linear Regression With default input columns
print("Linear Regression With default input columns")
try_model(LinearRegression(),input_cols)
print()

# Linear Regression With default input columns+precip
print("Linear Regression With default input columns+precip")
try_model(LinearRegression(),input_cols+["precip"])
print()
# conclusion:increases the error

#Linear Regression With default input columns+snow
print("Linear Regression With default input columns+snow")
try_model(LinearRegression(),input_cols+["snow"])
print()
# conclusion:increases the error

#Linear Regression With default input columns+snowdepth
print("Linear Regression With default input columns+snowdepth")
try_model(LinearRegression(),input_cols+["snowdepth"])
print()
# conclusion:increases the error

#Linear Regression With default input columns+snow+snowdepth
print("Linear Regression With default input columns+snow+snowdepth")
try_model(LinearRegression(),input_cols+["snowdepth","snow"])
print()
# conclusion:increases the error

#Linear Regression With default input columns+winddir
print("Linear Regression With default input columns+winddir")
try_model(LinearRegression(),input_cols+["winddir"])
print()
# conclusion:increases the error for linear model atleast!

#Linear Regression With default input columns+sealevelpressure
print("Linear Regression With default input columns+sealevelpressure")
try_model(LinearRegression(),input_cols+["sealevelpressure"])
print()
# conclusion:Doesn't increase the error-->but very weak increment to the performance

#Linear Regression With default input columns+cloudcover
print("Linear Regression With default input columns+cloudcover")
try_model(LinearRegression(),input_cols+["cloudcover"])
print()
# conclusion:Doesn't increase the error-->but very weak increment to the performance

#Linear Regression With default input columns+visibility
print("Linear Regression With default input columns+visibility")
try_model(LinearRegression(),input_cols+["visibility"])
print()
# conclusion:Doesn't increase the error-->but very very very weak increment to the performance

#Linear Regression With default input columns+uvindex
print("Linear Regression With default input columns+uvindex")
try_model(LinearRegression(),input_cols+["uvindex"])
print()
# conclusion:Increases error

#Linear Regression With default input columns+sealevelpressure+cloudcover
print("Linear Regression With default input columns+sealevelpressure+cloudcover")
try_model(LinearRegression(),input_cols+["sealevelpressure","cloudcover"])
print()
# conclusion:slight good increment(not much though)

#Linear Regression With default input columns+sealevelpressure+cloudcover+visibility
print("Linear Regression With default input columns+sealevelpressure+cloudcover+visibility")
try_model(LinearRegression(),input_cols+["sealevelpressure","cloudcover","visibility"])
print()
# conclusion:Best one yet! can't get better than this for linear regression i guess.

Linear Regression With default input columns
Root Mean Squared Error: 0.15637155567256225
Mean Squared Error: 0.024452063423457234
Mean Absolute Error: 0.10384913762636784
R2 score: 0.7383495312122629

Linear Regression With default input columns+precip
Root Mean Squared Error: 0.15642311759272112
Mean Squared Error: 0.024468191717426258
Mean Absolute Error: 0.10394367987652514
R2 score: 0.7381769496348042

Linear Regression With default input columns+snow
Root Mean Squared Error: 0.156384180516359
Mean Squared Error: 0.02445601191577316
Mean Absolute Error: 0.10385479236884926
R2 score: 0.7383072801822548

Linear Regression With default input columns+snowdepth
Root Mean Squared Error: 0.15635870323395928
Mean Squared Error: 0.02444804407700535
Mean Absolute Error: 0.10375334743490555
R2 score: 0.7383925404203262

Linear Regression With default input columns+snow+snowdepth
Root Mean Squared Error: 0.15637198179284145
Mean Squared Error: 0.024452196689820738
Mean Absolute Error: 0.10376

RandomForest Regressor:

In [38]:
from sklearn.ensemble import RandomForestRegressor

#RandomForest Regresor With default input columns
print("RandomForest Regressor With default input columns")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols)
print()

# RandomForest Regresor With default input columns+precip
print("RandomForest Regresor With default input columns+precip")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["precip"])
print()
# conclusion:improves model by very little amount

#RandomForest Regresor With default input columns+snow
print("RandomForest Regresor With default input columns+snow")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["snow"])
print()
# conclusion:increases the error

#RandomForest Regresor With default input columns+snowdepth
print("RandomForest Regresor With default input columns+snowdepth")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["snowdepth"])
print()
# conclusion:improves model by very little amount

#RandomForest Regresor With default input columns+snow+snowdepth
print("RandomForest Regresor With default input columns+snow+snowdepth")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["snowdepth","snow"])
print()
# conclusion:increases the error

#RandomForest Regresor With default input columns+winddir
print("RandomForest Regresor With default input columns+winddir")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["winddir"])
print()
# conclusion:improves model by little amount

#RandomForest Regresor With default input columns+sealevelpressure
print("RandomForest Regresor With default input columns+sealevelpressure")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["sealevelpressure"])
print()
# conclusion:improves model by very little amount

#RandomForest Regresor With default input columns+cloudcover
print("RandomForest Regresor With default input columns+cloudcover")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["cloudcover"])
print()
# conclusion:improves model by good amount

#RandomForest Regresor With default input columns+visibility
print("RandomForest Regresor With default input columns+visibility")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["visibility"])
print()
# conclusion:improves model by very little amount

#RandomForest Regresor With default input columns+uvindex
print("RandomForest Regresor With default input columns+uvindex")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["uvindex"])
print()
# conclusion:improves model by very little amount

#RandomForest Regresor With default input columns+sealevelpressure+cloudcover
print("RandomForest Regresor With default input columns+sealevelpressure+cloudcover")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["sealevelpressure","cloudcover"])
print()
# conclusion:improves model by good amount

#RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
print("RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip")
try_model(RandomForestRegressor(n_jobs=8,random_state=42),input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"])
print()
# Best one yet!

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221


RandomForest Regressor With default input columns
Root Mean Squared Error: 0.1652915049789969
Mean Squared Error: 0.027321281618221754
Mean Absolute Error: 0.09934364033266727
R2 score: 0.7076473253201314

RandomForest Regresor With default input columns+precip
Root Mean Squared Error: 0.1641199272051417
Mean Squared Error: 0.02693535050582101
Mean Absolute Error: 0.09834153498089865
R2 score: 0.7117769995619607

RandomForest Regresor With default input columns+snow
Root Mean Squared Error: 0.16550697576722567
Mean Squared Error: 0.02739255902761303
Mean Absolute Error: 0.09961903002498333
R2 score: 0.7068846180075326

RandomForest Regresor With default input columns+snowdepth
Root Mean Squared Error: 0.16580547927822664
Mean Squared Error: 0.027491456958682447
Mean Absolute Error: 0.09968458334711543
R2 score: 0.7058263559877462

RandomForest Regresor With default input columns+snow+snowdepth
Root Mean Squared Error: 0.1660409561303238
Mean Squared Error: 0.027569599112672108
Mean Abs

Trying HyperParameters:

(1)max_depth

In [39]:
# BASELINE:

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221

# New Models:
for i in range(1,15):
    print("Depth=",i)
    try_model(RandomForestRegressor( n_jobs=8,random_state=42,max_depth=i),
              input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"]
             )
    # Best Depth Value=5
    # After max_depth=5 model starts to overfit on train_df data.
# Depth= 5
# Root Mean Squared Error: 0.14707293409848013
# Mean Squared Error: 0.02163044794433588
# Mean Absolute Error: 0.08444597230491156
# R2 score: 0.7508610226137489




Depth= 1
Root Mean Squared Error: 0.18292434863710907
Mean Squared Error: 0.03346131732431063
Mean Absolute Error: 0.13000566783582904
R2 score: 0.6419455809294957
Depth= 2
Root Mean Squared Error: 0.1559058544662815
Mean Squared Error: 0.024306635456861347
Mean Absolute Error: 0.09697073538099274
R2 score: 0.7399056900924235
Depth= 3
Root Mean Squared Error: 0.15710969684382067
Mean Squared Error: 0.024683456842357236
Mean Absolute Error: 0.09794032225419785
R2 score: 0.7358734949170396
Depth= 4
Root Mean Squared Error: 0.15275852650455438
Mean Squared Error: 0.023335167419842644
Mean Absolute Error: 0.09234606023928447
R2 score: 0.7503009300726373
Depth= 5
Root Mean Squared Error: 0.15205578289520497
Mean Squared Error: 0.02312096111187371
Mean Absolute Error: 0.09048078523571322
R2 score: 0.7525930548690908
Depth= 6
Root Mean Squared Error: 0.1525496691114952
Mean Squared Error: 0.023271401546026676
Mean Absolute Error: 0.08997266702645251
R2 score: 0.7509832598412
Depth= 7
Root Mea

In [40]:
help(RandomForestRegressor())

Help on RandomForestRegressor in module sklearn.ensemble._forest object:

class RandomForestRegressor(ForestRegressor)
 |  RandomForestRegressor(
 |      n_estimators=100,
 |      *,
 |      criterion='squared_error',
 |      max_depth=None,
 |      min_samples_split=2,
 |      min_samples_leaf=1,
 |      min_weight_fraction_leaf=0.0,
 |      max_features=1.0,
 |      max_leaf_nodes=None,
 |      min_impurity_decrease=0.0,
 |      bootstrap=True,
 |      oob_score=False,
 |      n_jobs=None,
 |      random_state=None,
 |      verbose=0,
 |      warm_start=False,
 |      ccp_alpha=0.0,
 |      max_samples=None,
 |      monotonic_cst=None
 |  )
 |
 |  A random forest regressor.
 |
 |  A random forest is a meta estimator that fits a number of decision tree
 |  regressors on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  Trees in the forest use the best split strategy, i.e. equivalent to passing
 |  `splitter="best

(2)n_estimators

In [41]:
# Baseline:

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221

# New models:
for i in range(1,10):
    print("n_estimators=",25*i)
    try_model(RandomForestRegressor(n_jobs=8,random_state=42,n_estimators=25*i,),
              input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"]
             )
 

# n_estimators= 225
# Root Mean Squared Error: 0.152020766191822
# Mean Squared Error: 0.02311031335354861
# Mean Absolute Error: 0.08587965554941963
# R2 score: 0.7338159685460133
# n_estimators= 450
# Root Mean Squared Error: 0.1519663395439153
# Mean Squared Error: 0.023093768354376557
# Mean Absolute Error: 0.08586206806048743
# R2 score: 0.7340065334471744


n_estimators= 25
Root Mean Squared Error: 0.15984523440056372
Mean Squared Error: 0.025550498960571162
Mean Absolute Error: 0.09312174494755898
R2 score: 0.7265956694525533
n_estimators= 50
Root Mean Squared Error: 0.15965882557755529
Mean Squared Error: 0.025490940584804226
Mean Absolute Error: 0.09299614470961017
R2 score: 0.7272329766879297
n_estimators= 75
Root Mean Squared Error: 0.16031450204599124
Mean Squared Error: 0.02570073956625413
Mean Absolute Error: 0.09303108009810626
R2 score: 0.7249880126987192
n_estimators= 100
Root Mean Squared Error: 0.15983966897113475
Mean Squared Error: 0.025548719776801933
Mean Absolute Error: 0.09291564947902108
R2 score: 0.7266147076931802
n_estimators= 125
Root Mean Squared Error: 0.1597774160150231
Mean Squared Error: 0.025528822668437763
Mean Absolute Error: 0.09304166691575563
R2 score: 0.7268276176485016
n_estimators= 150
Root Mean Squared Error: 0.15948135510614364
Mean Squared Error: 0.02543430262649189
Mean Absolute Error: 0.092888333

(3)min_samples_leaf

In [42]:
# Baseline:

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221

# New models:
for i in range(1,20):
    print("min_samples_leaf",i*25)
    try_model(RandomForestRegressor(n_jobs=8,random_state=42,min_samples_leaf=25*i),
              input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"]
             )
# min_samples_leaf 50
# Root Mean Squared Error: 0.14541779599811633
# Mean Squared Error: 0.02114633539294978
# Mean Absolute Error: 0.08174422658044207
# R2 score: 0.7564370192968766


min_samples_leaf 25
Root Mean Squared Error: 0.15161557212486454
Mean Squared Error: 0.022987281710750002
Mean Absolute Error: 0.08722131362108851
R2 score: 0.7540234976650853
min_samples_leaf 50
Root Mean Squared Error: 0.1510716580395599
Mean Squared Error: 0.02282264586282173
Mean Absolute Error: 0.08766017721343798
R2 score: 0.7557851913939005
min_samples_leaf 75
Root Mean Squared Error: 0.1519417087002937
Mean Squared Error: 0.023086282842764906
Mean Absolute Error: 0.08917672221788239
R2 score: 0.7529641313386685
min_samples_leaf 100
Root Mean Squared Error: 0.15236553558767166
Mean Squared Error: 0.02321525643491804
Mean Absolute Error: 0.09035009669921891
R2 score: 0.7515840432756
min_samples_leaf 125
Root Mean Squared Error: 0.15300123024745838
Mean Squared Error: 0.02340937645723577
Mean Absolute Error: 0.0918520403266937
R2 score: 0.7495068527350343
min_samples_leaf 150
Root Mean Squared Error: 0.15352827691916332
Mean Squared Error: 0.023570931813767296
Mean Absolute Error:

(4)max_leaf_nodes

In [43]:
# Baseline:

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221

# New models:
for i in range(1,20):
    print("max_leaf_nodes",i*25)
    try_model(RandomForestRegressor(n_jobs=8,random_state=42,max_leaf_nodes=25*i),
              input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"]
             )
#  max_leaf_nodes 50
# Root Mean Squared Error: 0.14639915958187355
# Mean Squared Error: 0.02143271392627888
# Mean Absolute Error: 0.08365887010616885
# R2 score: 0.7531385182615492


max_leaf_nodes 25
Root Mean Squared Error: 0.1522466887329676
Mean Squared Error: 0.02317905423015312
Mean Absolute Error: 0.09108669918861928
R2 score: 0.7519714266912182
max_leaf_nodes 50
Root Mean Squared Error: 0.15233898398236154
Mean Squared Error: 0.0232071660407782
Mean Absolute Error: 0.08986926811544521
R2 score: 0.7516706149232646
max_leaf_nodes 75
Root Mean Squared Error: 0.15331364674986916
Mean Squared Error: 0.02350507427974367
Mean Absolute Error: 0.08993961839175967
R2 score: 0.7484828336292627
max_leaf_nodes 100
Root Mean Squared Error: 0.1540007289353624
Mean Squared Error: 0.02371622451262297
Mean Absolute Error: 0.09017506301575574
R2 score: 0.7462234105098008
max_leaf_nodes 125
Root Mean Squared Error: 0.15485510547644415
Mean Squared Error: 0.023980103692120645
Mean Absolute Error: 0.09046070598969047
R2 score: 0.7433997587867052
max_leaf_nodes 150
Root Mean Squared Error: 0.15564213258501633
Mean Squared Error: 0.0242244734356118
Mean Absolute Error: 0.090697148

(5)min_samples_split


In [44]:
# Baseline:

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221

# New models:
# for i in range(2,150):
#     print("min_samples_split",i)
#     try_model(RandomForestRegressor(n_jobs=8,random_state=42,min_samples_split=i),
#               input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"]
#              )


# min_samples_split 133
# Root Mean Squared Error: 0.1468922935407616
# Mean Squared Error: 0.02157734590166527
# Mean Absolute Error: 0.08267470149260632
# R2 score: 0.7514726506596461


(6)    max_features

In [45]:
# Baseline:

# RandomForest Regresor With default input columns+sealevelpressure+cloudcover+visibility+uvindex+winddir+snowdepth+precip
# Root Mean Squared Error: 0.15247443668728836
# Mean Squared Error: 0.023248453843105907
# Mean Absolute Error: 0.08627851418549864
# R2 score: 0.7322248697212221

# New models:

for i in range(1,len(input_cols)+8):
    print("max_features=",i)
    try_model(RandomForestRegressor(n_jobs=8,max_features=i,random_state=42),
          input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"])
    
# max_features= 6
# Root Mean Squared Error: 0.1496895756848935
# Mean Squared Error: 0.022406969068723456
# Mean Absolute Error: 0.08497653930714073
# R2 score: 0.7419170710438785


max_features= 1
Root Mean Squared Error: 0.15733354492191312
Mean Squared Error: 0.02475384435769566
Mean Absolute Error: 0.09625553248615829
R2 score: 0.7351203099581197
max_features= 2
Root Mean Squared Error: 0.15632248326787518
Mean Squared Error: 0.02443671877503512
Mean Absolute Error: 0.0936457858601136
R2 score: 0.7385137273532387
max_features= 3
Root Mean Squared Error: 0.15610024691381869
Mean Squared Error: 0.024367287086555157
Mean Absolute Error: 0.09189854414087284
R2 score: 0.7392566844413543
max_features= 4
Root Mean Squared Error: 0.15669445843768073
Mean Squared Error: 0.024553153305078054
Mean Absolute Error: 0.09277876597618756
R2 score: 0.7372678141212459
max_features= 5
Root Mean Squared Error: 0.15785505724956506
Mean Squared Error: 0.024918219099263464
Mean Absolute Error: 0.09313180876905805
R2 score: 0.7333614102103454
max_features= 6
Root Mean Squared Error: 0.15640343399317852
Mean Squared Error: 0.024462034164858554
Mean Absolute Error: 0.09238126330778693


Final RandomForestRegressor Model After Hypertuning

In [46]:
try_model(RandomForestRegressor(n_jobs=8,min_samples_leaf=50,random_state=42),
          input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"])
# Root Mean Squared Error: 0.14656805453691982
# Mean Squared Error: 0.021482194610737502
# Mean Absolute Error: 0.08636917939143215
# R2 score: 0.7525686009321385

Root Mean Squared Error: 0.15107165803955994
Mean Squared Error: 0.022822645862821733
Mean Absolute Error: 0.08766017721343801
R2 score: 0.7557851913939004


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",50
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [47]:
BEST_COLS_FOR_RF=input_cols+["sealevelpressure","cloudcover","visibility","uvindex","precip","snowdepth","winddir"]
max_depth_hyp=[5,6,7,8]
n_estimamtors_hyp=[150,225,300]
min_sample_leaf_hyp=[25,50]
max_leaf_nodes_hyp=[75,125]
min_samples_split_hyp=[133,50]
max_features_hyp=[3,6,8]

GridSearching

In [48]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    "max_depth": max_depth_hyp,
    "min_samples_leaf": min_sample_leaf_hyp,
    "max_features": max_features_hyp,
    "n_estimators": n_estimamtors_hyp,
    "max_leaf_nodes":max_leaf_nodes_hyp,
    "min_samples_split":min_samples_split_hyp
}
grid = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=8,random_state=42),
    param_grid=param_grid,
    scoring="r2",
    cv=5,              # cross-validation
    n_jobs=8,
    verbose=1
)
grid.fit(train_df[BEST_COLS_FOR_RF], train_df[target_cols])


Fitting 5 folds for each of 288 candidates, totalling 1440 fits


0,1,2
,"estimator  estimator: estimator object This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed.",RandomForestR...ndom_state=42)
,"param_grid  param_grid: dict or list of dictionaries Dictionary with parameters names (`str`) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of parameter settings.","{'max_depth': [5, 6, ...], 'max_features': [3, 6, ...], 'max_leaf_nodes': [75, 125], 'min_samples_leaf': [25, 50], ...}"
,"scoring  scoring: str, callable, list, tuple or dict, default=None Strategy to evaluate the performance of the cross-validated model on the test set. If `scoring` represents a single score, one can use: - a single string (see :ref:`scoring_string_names`); - a callable (see :ref:`scoring_callable`) that returns a single value; - `None`, the `estimator`'s  :ref:`default evaluation criterion ` is used. If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric  names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. See :ref:`multimetric_grid_search` for an example.",'r2'
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionchanged:: v0.20  `n_jobs` default changed from 1 to None",8
,"refit  refit: bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given ``cv_results_``. In that case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this ``GridSearchCV`` instance. Also for multiple metric evaluation, the attributes ``best_index_``, ``best_score_`` and ``best_params_`` will only be available if ``refit`` is set and all of them will be determined w.r.t this specific scorer. See ``scoring`` parameter to know more about multiple metric evaluation. See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` to see how to design a custom selection strategy using a callable via `refit`. See :ref:`this example ` for an example of how to use ``refit=callable`` to balance model complexity and cross-validated score. .. versionchanged:: 0.20  Support for callable added.",True
,"cv  cv: int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",5
,"verbose  verbose: int Controls the verbosity: the higher, the more messages. - >1 : the computation time for each fold and parameter candidate is  displayed; - >2 : the score is also displayed; - >3 : the fold and candidate parameter indexes are also displayed  together with the starting time of the computation.",1
,"pre_dispatch  pre_dispatch: int, or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use  this for lightweight and fast-running jobs, to avoid delays due to on-demand  spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A str, giving an expression as a function of n_jobs, as in '2*n_jobs'",'2*n_jobs'
,"error_score  error_score: 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error.",
,"return_train_score  return_train_score: bool, default=False If ``False``, the ``cv_results_`` attribute will not include training scores. Computing training scores is used to get insights on how different parameter settings impact the overfitting/underfitting trade-off. However computing the scores on the training set can be computationally expensive and is not strictly required to select the parameters that yield the best generalization performance. .. versionadded:: 0.19 .. versionchanged:: 0.21  Default value was changed from ``True`` to ``False``",False

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",150
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",8
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",50
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",50
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",6
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",75
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [49]:
grid.best_params_


{'max_depth': 8,
 'max_features': 6,
 'max_leaf_nodes': 75,
 'min_samples_leaf': 50,
 'min_samples_split': 50,
 'n_estimators': 150}

In [65]:
# m=try_model(RandomForestRegressor(max_depth=8, max_features=6, max_leaf_nodes=75,
#                       min_samples_leaf=50, min_samples_split=50,
#                       n_estimators=150, n_jobs=8, random_state=42)
# ,BEST_COLS_FOR_RF)
model_RF_Final=try_model(RandomForestRegressor(n_jobs=8,min_samples_leaf=8,max_depth=8,max_features=8,random_state=42),BEST_COLS_FOR_RF)
predictions=model_RF_Final.predict(train_df[BEST_COLS_FOR_RF])


Root Mean Squared Error: 0.15126833379503943
Mean Squared Error: 0.022882108809127465
Mean Absolute Error: 0.089008561620482
R2 score: 0.7551489053060169


In [66]:
print("Root Mean Squared Error:",root_mean_squared_error(train_df[target_cols],predictions))
print("Mean Squared Error:",mean_squared_error(train_df[target_cols],predictions))
print("Mean Absolute Error:",mean_absolute_error(train_df[target_cols],predictions))
print("R2 score:",r2_score(train_df[target_cols],predictions))

Root Mean Squared Error: 0.12738665399906388
Mean Squared Error: 0.01622735961707722
Mean Absolute Error: 0.06825753465667436
R2 score: 0.760700967687648


In [68]:
predictions_test=model_RF_Final.predict(test_df[BEST_COLS_FOR_RF])
print("Root Mean Squared Error:",root_mean_squared_error(test_df[target_cols],predictions_test))
print("Mean Squared Error:",mean_squared_error(test_df[target_cols],predictions_test))
print("Mean Absolute Error:",mean_absolute_error(test_df[target_cols],predictions_test))
print("R2 score:",r2_score(test_df[target_cols],predictions_test))

Root Mean Squared Error: 0.13656841540238068
Mean Squared Error: 0.018650932085517206
Mean Absolute Error: 0.07159641090033655
R2 score: 0.671519057796983


XGBRegressor

In [53]:
from xgboost import XGBRegressor
m=try_model(XGBRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=8
)
,BEST_COLS_FOR_RF)

Root Mean Squared Error: 0.18499568559474835
Mean Squared Error: 0.03422340368867098
Mean Absolute Error: 0.11531653074033259
R2 score: 0.6337908395059012


In [63]:
try_model(XGBRegressor(random_state=42,n_jobs=42),BEST_COLS_FOR_RF)

Root Mean Squared Error: 0.18128343025051474
Mean Squared Error: 0.03286368208339324
Mean Absolute Error: 0.1124228238675845
R2 score: 0.6483406052774245


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [64]:
try_model(XGBRegressor(random_state=42,n_jobs=8,max_depth=24,learning_rate=0.03),BEST_COLS_FOR_RF)

Root Mean Squared Error: 0.17303359662479004
Mean Squared Error: 0.029940625560910546
Mean Absolute Error: 0.10397510199081812
R2 score: 0.679618910758464


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


Exporting Random Forest Model

In [72]:
import joblib

artifact = {
    "model": model_RF_Final,
    "input_columns": BEST_COLS_FOR_RF,
    "target_column": target_cols,
    "train_df": train_df,
    "val_df": val_df,
    "test_df":test_df,
    
}
joblib.dump(artifact,"RF_artifact.joblib")

['RF_artifact.joblib']

In [54]:
# Linear Regression With default input columns+sealevelpressure+cloudcover+visibility
# Root Mean Squared Error: 0.15424102275397691
# Mean Squared Error: 0.023790293100192827
# Mean Absolute Error: 0.09847370547477198
# R2 score: 0.7259839782350284
#approach
# 1. Understanding relationship between target and numeric features using correlation and scatter plots
# 2. Understanding relationship between target and categorical features using box plots and violin plots
# 3. If categorical features look important to the target we will do one hot encoding for that.
# (if there is only two possible values we can do binary encoding)
# 4. Plot the difference after encoding to see if there is any improvement in correlation
# 5. After all these steps we will standardize the numeric features using StandardScaler from sklearn so that weight is high for the most
#  affecting feature that affects the target the most.
# 6. Now whenever we get a new data point we will do the same transformations on that data point before feeding it to the model.

# few important points
# 1. split data into train test and validation sets according to year
# 2. Imputation
# 3. Scaling
# 4. Encoding

# Remember decision tree uses gini score s