# Machine Learning Assignment 1 - part 2

### Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error



# Question 2: Weather Forecast

### Reading Dataset

In [2]:
# Forward slashes work on Windows, Linux and macOS alike. The previous
# backslash-separated literals contained the invalid escape sequence "\w"
# and only resolved on Windows.
DATA_DIR = 'Datasets/weatherHistory'
train_data_path = f'{DATA_DIR}/weatherHistory_Train.csv'
test_data_path = f'{DATA_DIR}/weatherHistory_Test.csv'

# Load both splits into DataFrames.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

print(f'Train Data Shape: {train_data.shape}')
print(f'Test Data Shape: {test_data.shape}')

Train Data Shape: (67517, 12)
Test Data Shape: (28936, 11)


# Data Purification

### Drop All Null Values

In [3]:
# Discard every training row that has at least one missing value.
train_data = train_data.dropna(axis=0, how='any')

# Display the number of missing 'Precip Type' entries that remain.
train_data['Precip Type'].isnull().sum()

0

### Encode Features and Feature Engineering

In [4]:
def encode_with_train_labels(train_column, test_column):
    """Label-encode a categorical column using the training labels only.

    The encoder is fitted on ``train_column`` and the resulting
    label -> code mapping is reused for ``test_column``, so a given label
    always gets the same integer in both splits. Re-fitting on the test
    split (as was done before) can assign different codes to the same label
    whenever the two splits do not contain exactly the same set of labels.

    Test labels that are missing or were never seen during training are
    replaced by the code of the most frequent training label. Test rows are
    never dropped, so every test row still gets a prediction.

    Parameters
    ----------
    train_column : pandas.Series
        Training labels; the encoder is fitted on these.
    test_column : pandas.Series
        Test labels to encode with the training mapping.

    Returns
    -------
    tuple
        ``(train_codes, test_codes)``: a NumPy array of integer codes for
        the training rows and an integer Series (indexed like
        ``test_column``) for the test rows.
    """
    encoder = LabelEncoder().fit(train_column)
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    fallback_code = code_by_label[train_column.mode().iloc[0]]

    train_codes = encoder.transform(train_column)
    # Series.map yields NaN for labels absent from the mapping (including
    # missing values); those rows receive the fallback code.
    test_codes = test_column.map(code_by_label).fillna(fallback_code).astype(int)
    return train_codes, test_codes


def month_of(formatted_date):
    """Return the month number of each timestamp after conversion to UTC.

    NOTE(review): converting to UTC shifts timestamps close to a month
    boundary into the neighbouring month; confirm this is acceptable.
    """
    return pd.to_datetime(formatted_date, utc=True).dt.month


# Encode the categorical columns and replace the originals.
# 'Precip Type' -> Rain: 0, Snow: 1
# 'Summary'     -> 25 Types
for source_column, encoded_column in [('Precip Type', 'Precip_Types_Cat'),
                                      ('Summary', 'Summary_Cat')]:
    train_codes, test_codes = encode_with_train_labels(
        train_data[source_column], test_data[source_column]
    )
    train_data = train_data.assign(**{encoded_column: train_codes}).drop(source_column, axis=1)
    test_data = test_data.assign(**{encoded_column: test_codes}).drop(source_column, axis=1)


# 181 Types - Deleted
train_data = train_data.drop('Daily Summary', axis=1)
test_data = test_data.drop('Daily Summary', axis=1)


# This col contains only zeros - Better to be deleted
train_data = train_data.drop('Loud Cover', axis=1)
test_data = test_data.drop('Loud Cover', axis=1)


# Keep only months of the Formatted Date
train_data['Month'] = month_of(train_data['Formatted Date'])
train_data = train_data.drop('Formatted Date', axis=1)

test_data['Month'] = month_of(test_data['Formatted Date'])
test_data = test_data.drop('Formatted Date', axis=1)

In [5]:
# Preview the first five rows of the processed training frame.
train_data.head(n=5)

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),Precip_Types_Cat,Summary_Cat,Month
0,9.472222,7.388889,0.89,14.1197,251.0,15.8263,1015.13,0,19,3
1,9.355556,7.227778,0.86,14.2646,259.0,15.8263,1015.63,0,19,3
2,9.377778,9.377778,0.89,3.9284,204.0,14.9569,1015.94,0,17,4
3,8.288889,5.944444,0.83,14.1036,269.0,15.8263,1016.41,0,19,4
4,8.755556,6.977778,0.83,11.0446,259.0,15.8263,1016.51,0,17,4


In [6]:
# Preview the first five rows of the processed test frame.
test_data.head(n=5)

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),Precip_Types_Cat,Summary_Cat,Month
0,13.8,0.77,12.3809,211.0,11.2056,1013.92,0,15,5
1,14.933333,0.72,11.2056,230.0,11.2056,1014.46,0,15,5
2,16.872222,0.64,14.0714,233.0,10.3523,1014.91,0,15,5
3,17.8,0.64,12.88,233.0,10.0464,1015.25,0,13,5
4,18.933333,0.56,20.3987,250.0,11.27,1015.26,0,13,5


### Extract Targets

In [8]:
# Separate the regression target from the training features.
target_column = 'Apparent Temperature (C)'
train_target = train_data[target_column]
train_data = train_data.drop(columns=[target_column])

In [9]:
# Print the (rows, columns) shape of the training frame, then the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

(67374, 9)
(28936, 9)


# Data Normalization: Standard Scaler

In [10]:
# Learn the per-feature mean and standard deviation from the training data only.
scaler = StandardScaler()
normalized_train_dataset = scaler.fit_transform(train_data)

# Apply the *training* statistics to the test data. Re-fitting on the test
# split would standardize it with different means/variances, so the model
# would receive inputs on a different scale than the one it was trained on.
normalized_test_dataset = scaler.transform(test_data)
normalized_test_dataset.shape

(28936, 9)

# Linear Regression (with L1 Regularization)

In [11]:
# Strength of the L1 penalty applied by the Lasso regressor.
LASSO_ALPHA = 1.3

# Fit on the standardized training features, then predict the test targets.
model = Lasso(alpha=LASSO_ALPHA)
model.fit(X=normalized_train_dataset, y=train_target)
y_pred = model.predict(X=normalized_test_dataset)

# Save Result as .CSV File

In [12]:
# Wrap the predictions in a single-column frame and write them to disk
# without the row index.
df = pd.DataFrame({'Apparent Temperature (C)': y_pred})
df.to_csv(path_or_buf='weather_pred.csv', index=False)