# **Checking up the data**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
file = 'training_.csv'
df = pd.read_csv(file)

In [3]:
df.shape

(23340, 23)

In [4]:
df.count()

time                    23340
location                23340
temperature             23260
dew_point               23259
wind_speed              23337
wind_direction          23112
total_cover             23340
pressure                23255
time_1_day              23340
temperature_1_day       23263
dew_point_1_day         23262
wind_speed_1_day        23337
wind_direction_1_day    23120
total_cover_1_day       23340
pressure_1_day          23258
time_2_day              23340
temperature_2_day       23266
dew_point_2_day         23265
wind_speed_2_day        23336
wind_direction_2_day    23113
total_cover_2_day       23340
pressure_2_day          23261
label                   23340
dtype: int64

In [5]:
df.describe(include='all')

Unnamed: 0,time,location,temperature,dew_point,wind_speed,wind_direction,total_cover,pressure,time_1_day,temperature_1_day,...,total_cover_1_day,pressure_1_day,time_2_day,temperature_2_day,dew_point_2_day,wind_speed_2_day,wind_direction_2_day,total_cover_2_day,pressure_2_day,label
count,23340,23340,23260.0,23259.0,23337.0,23112.0,23340.0,23255.0,23340,23263.0,...,23340.0,23258.0,23340,23266.0,23265.0,23336.0,23113.0,23340.0,23261.0,23340.0
unique,8249,3,,,,,,,8254,,...,,,8243,,,,,,,
top,2021-10-18 10:00:00+00:00,QAIA,,,,,,,2021-10-18 10:00:00+00:00,,...,,,2021-10-18 10:00:00+00:00,,,,,,,
freq,6,7846,,,,,,,6,,...,,,6,,,,,,,
mean,,,19.83031,7.007997,3.765795,204.132831,19.875321,1014.799441,,19.835361,...,19.883676,1014.792888,,19.834995,6.996475,3.759913,203.981655,19.858612,1014.793947,19.827506
std,,,9.172273,5.807325,2.659252,132.619094,21.108254,5.132478,,9.173669,...,21.104368,5.141182,,9.17742,5.799084,2.658981,132.741978,21.08022,5.14162,9.172048
min,,,-3.0,-16.0,0.0,0.0,0.0,1000.0,,-3.0,...,0.0,1000.0,,-3.0,-16.0,0.0,0.0,0.0,1000.0,-3.0
25%,,,13.0,3.0,2.06,60.0,0.0,1011.0,,13.0,...,0.0,1011.0,,13.0,3.0,2.06,60.0,0.0,1011.0,13.0
50%,,,20.0,7.0,3.6,250.0,15.0,1015.0,,20.0,...,15.0,1015.0,,20.0,7.0,3.6,250.0,15.0,1015.0,20.0
75%,,,27.0,11.0,5.66,310.0,15.0,1018.0,,27.0,...,15.0,1018.0,,27.0,11.0,5.66,310.0,15.0,1018.0,27.0


# **Pridiction**

In [6]:
df.dropna(subset=['temperature'], inplace=True)
df.dropna(subset=['dew_point'], inplace=True)
df.dropna(subset=['wind_speed'], inplace=True)
df.dropna(subset=['wind_direction'], inplace=True)
df.dropna(subset=['pressure'], inplace=True)
df.dropna(subset=['temperature_1_day'], inplace=True)
df.dropna(subset=['dew_point_1_day'], inplace=True)
df.dropna(subset=['wind_speed_1_day'], inplace=True)
df.dropna(subset=['wind_direction_1_day'], inplace=True)
df.dropna(subset=['pressure_1_day'], inplace=True)
df.dropna(subset=['temperature_2_day'], inplace=True)
df.dropna(subset=['dew_point_2_day'], inplace=True)
df.dropna(subset=['wind_speed_2_day'], inplace=True)
df.dropna(subset=['wind_direction_2_day'], inplace=True)
df.dropna(subset=['pressure_2_day'], inplace=True)

encoding

In [7]:
cat_columns = df.select_dtypes(['object']).columns
label_encoder = LabelEncoder()
df[cat_columns] = df[cat_columns].apply(LabelEncoder().fit_transform)

In [8]:
c = ['temperature', 'dew_point', 'wind_speed', 'wind_direction','pressure', 'temperature_1_day','dew_point_1_day', 'wind_speed_1_day'
, 'wind_direction_1_day', 'pressure_1_day', 'temperature_2_day', 'dew_point_2_day', 'wind_speed_2_day', 'wind_direction_2_day', 'pressure_2_day']
Q1 = df[c].quantile(0.25)
Q3 = df[c].quantile(0.75)
IQR = Q3 - Q1
threshold = 1.5
o = ((df[c] < (Q1 - threshold * IQR)) | (df[c] > (Q3 + threshold * IQR))).any(axis=1)
df.drop(df[o].index, inplace=True)

In [9]:
iteration=30

**select from model**

In [10]:
X = df.drop(["label"], axis=1)
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)
model = LinearRegression()
model.fit(X_train, y_train)
sfm = SelectFromModel(model, threshold="median")
sfm.fit(X_train, y_train)
selected_features = X.columns[sfm.get_support()]
print("Selected features:", selected_features)
X = X[selected_features]

Selected features: Index(['time', 'dew_point', 'pressure', 'time_1_day', 'wind_speed_1_day',
       'pressure_1_day', 'time_2_day', 'temperature_2_day', 'dew_point_2_day',
       'wind_speed_2_day', 'pressure_2_day'],
      dtype='object')


linear regression

In [11]:
lr = LinearRegression()
sum_LR_mse_SFM = []
sum_LR_r2_SFM = []

for i in range(iteration):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
  lr.fit(X_train, y_train)
  y_pred = lr.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  sum_LR_mse_SFM.append(mse)
  sum_LR_r2_SFM.append(r2)

avg_LR_mse_SFM = sum(sum_LR_mse_SFM) / iteration
avg_LR_r2_SFM = sum(sum_LR_r2_SFM) / iteration
print("Linear Regression:")
print("MSE =", avg_LR_mse_SFM)
print("R2 =", avg_LR_r2_SFM * 100)

Linear Regression:
MSE = 10.634056941889778
R2 = 87.23118118832687







Decision Tree Regression

In [12]:
X = df.drop(["label"], axis=1)
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

sfm = SelectFromModel(model, threshold="median")
sfm.fit(X_train, y_train)

selected_features = X.columns[sfm.get_support()]
print("Selected features:", selected_features)
X_selected = X[selected_features]

Selected features: Index(['time', 'temperature', 'dew_point', 'pressure', 'time_1_day',
       'temperature_1_day', 'time_2_day', 'temperature_2_day',
       'dew_point_2_day', 'wind_direction_2_day', 'pressure_2_day'],
      dtype='object')


In [13]:
dt = DecisionTreeRegressor()
sum_DT_mse_SFM = []
sum_DT_r2_SFM = []

for i in range(iteration):
  X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=i)
  dt.fit(X_train, y_train)
  y_pred = dt.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  sum_DT_mse_SFM.append(mse)
  sum_DT_r2_SFM.append(r2)

avg_DT_mse_SFM = sum(sum_DT_mse_SFM) / iteration
avg_DT_r2_SFM = sum(sum_DT_r2_SFM) / iteration
print("Decision Tree:")
print("MSE =", avg_DT_mse_SFM)
print("R2 =", avg_DT_r2_SFM * 100)

Decision Tree:
MSE = 12.42025108890597
R2 = 85.0850808611537


Randome forest

In [14]:
X = df.drop(["label"], axis=1)
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

sfm = SelectFromModel(model, threshold='median')
sfm.fit(X_train, y_train)

features = X.columns[sfm.get_support()]
print("Selected features:", features)
X = X[selected_features]

Selected features: Index(['time', 'temperature', 'dew_point', 'pressure', 'time_1_day',
       'temperature_1_day', 'dew_point_1_day', 'time_2_day',
       'temperature_2_day', 'dew_point_2_day', 'pressure_2_day'],
      dtype='object')


In [15]:
rf = RandomForestRegressor()
RF_mse_SFM = []
RF_r2_SFM = []


for i in range(iteration):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    RF_mse_SFM.append(mse)
    RF_r2_SFM.append(r2)

aRF_mse_SFM = sum(RF_mse_SFM) / iteration
aRF_r2_SFM = sum(RF_r2_SFM) / iteration

In [16]:
print("Random Forest:")
print("MSE =", aRF_mse_SFM)
print("R2 =", aRF_r2_SFM * 100)

Random Forest:
MSE = 6.161915186266975
R2 = 92.60114081197368


In [20]:
y_pred = rf.predict(X)

In [23]:
df2 = pd.DataFrame()

In [24]:
df2['label']=y_pred

In [25]:
df2.to_csv('submit_3.csv', index=False)

In [26]:
df.to_csv('submit_4.csv', index=False)