In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge, RidgeCV, LassoCV

# 1. Load the data
# The path is relative, so the CSV is looked up in the current working directory.
DATA_PATH = 'm1_final.csv'
data = pd.read_csv(DATA_PATH)

# 2. Exploratory Data Analysis (EDA)
# Preview the first rows of the freshly loaded frame.
data.head()



In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Preview only the object-typed columns (candidates for casting/encoding).
data.select_dtypes(include=["object"]).head()

In [None]:
# Cast the "Dew Point" column to 64-bit integers, overwriting it in `data`.
# NOTE(review): presumably the column is read from the CSV as strings; astype
# raises if any value cannot be converted to an integer — confirm the raw data.
data["Dew Point"] = data["Dew Point"].astype("int64")

In [None]:
# Impute missing 'Wind' values with the most frequent value (the mode).
mode = data['Wind'].mode().values[0]

# Assign the filled column back to the frame. Calling
# fillna(..., inplace=True) on the `data['Wind']` selection is chained
# in-place mutation: pandas 2.x emits a FutureWarning for it, and with
# copy-on-write enabled it no longer updates `data` at all.
data['Wind'] = data['Wind'].fillna(mode)

# Verify that the missing values have been replaced.
data.isnull().sum()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Descriptive statistics for the DEP_DELAY column.
dep_delay_summary = data["DEP_DELAY"].describe()
print(dep_delay_summary)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr_matrix = data.corr(numeric_only=True)
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
# tight_layout must be *called*; the bare attribute reference was a no-op.
plt.tight_layout()
plt.show()

In [None]:
# Histograms of the frame's columns, 50 bins each, on a 15x15 inch figure.
data.hist(
    bins=50,
    figsize=(15, 15),
    color='Green',
)
plt.show()

In [None]:
# Box plots of the columns of `data` on one shared axis.
# The figure size has to be passed to plt.subplots; assigning it to a
# standalone `figsize` variable had no effect on the figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=data, ax=ax)
# Rotate the column names so they do not overlap.
plt.xticks(rotation=90)
plt.show()


In [None]:
# Preview the first rows again after the cleaning steps above.
data.head()

In [None]:
# One LabelEncoder instance is refit for each column in turn; after the loop
# it only retains the mapping of the last column it was fitted on.
label_encoder = preprocessing.LabelEncoder()

# Columns whose values are replaced, in place, by integer codes.
categorical_columns = ['OP_UNIQUE_CARRIER', 'DEST', 'Wind', 'Condition', 'TAIL_NUM']

for column in categorical_columns:
    # Values are cast to str first so that every entry is encoded as a string.
    data[column] = label_encoder.fit_transform(data[column].astype(str))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of `data` with a min-max scaler (default range [0, 1]).
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split performed later in the notebook, so the test rows influence
# the scaling statistics (data leakage). Consider fitting on the training rows
# only and applying transform() to the test rows.
scaler = MinMaxScaler()

# fit_transform returns a bare array, so rebuild a DataFrame around it.
# Both the column labels and the original row index are carried over so that
# `data_normalized` stays aligned with `data`.
data_normalized = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)



In [None]:
# Split the columns by position: the first 22 columns are the independent
# variables and the column at position 22 is the dependent variable.
X = data_normalized.iloc[:, :22]
y = data_normalized.iloc[:, 22]

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=32,
)

In [None]:
## Linear regression
# Fit on the training split, then predict the held-out test split.
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

# Actual and predicted target values side by side.
pred = pd.DataFrame(
    {'Actual Value': y_test, 'Predicted Value': y_pred}
)
pred

In [None]:
# Test-set error metrics for the linear regression predictions.
Mean_Absolute_Error = metrics.mean_absolute_error(y_test, y_pred)
Mean_Sq_Error = metrics.mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed on the line above.
Root_Mean_Sqrr = np.sqrt(Mean_Sq_Error)

# R^2 of the linear model on the test split.
linear_r_squared = linear_reg.score(X_test, y_test)

print('R squared:{:.2f}'.format(linear_r_squared))
print('Mean Absolute Error:', Mean_Absolute_Error)
print('Mean Square Error:', Mean_Sq_Error)
print('Root Mean Square Error:', Root_Mean_Sqrr)

In [None]:
# Lasso regression with alpha=0.001, fitted on the training split.
model_lasso = Lasso(alpha=0.001).fit(X_train, y_train)
pred_lasso = model_lasso.predict(X_test)

# Actual and predicted target values side by side.
lasso_df = pd.DataFrame(
    {'Actual_score': y_test, 'Predicted_score': pred_lasso}
)
lasso_df


In [None]:
# Test-set error metrics for the Lasso predictions.
L_Mean_Absolute_Error = metrics.mean_absolute_error(y_test, pred_lasso)
L_Mean_Sq_Error = metrics.mean_squared_error(y_test, pred_lasso)
L_Root_Mean_Sqrr = np.sqrt(metrics.mean_squared_error(y_test, pred_lasso))

# Score the Lasso model itself. The previous version called linear_reg.score
# here, so the R squared shown next to the Lasso errors belonged to the plain
# linear regression.
print('R squared:{:.2f}'.format(model_lasso.score(X_test, y_test)))
print('Mean Absolute Error:', L_Mean_Absolute_Error)
print('Mean Square Error:', L_Mean_Sq_Error)
print('Root Mean Square Error:', L_Root_Mean_Sqrr)

In [None]:
# Candidate regularisation strengths to compare.
alphas = [0.001, 0.01, 0.1, 1]

# 5-fold cross-validated Lasso over the candidate alphas, fitted on the
# training split.
lassocv = LassoCV(alphas=alphas, cv=5)
lassocv.fit(X_train, y_train)

# After fitting, the selected alpha is exposed as `lassocv.alpha_`.
best_lasso_alpha = lassocv.alpha_
print("Best alpha: ", best_lasso_alpha)

In [None]:
# Ridge regression with alpha=0.1, fitted on the training split.
model_ridge = Ridge(alpha=0.1).fit(X_train, y_train)
pred_ridge = model_ridge.predict(X_test)

# Actual and predicted target values side by side.
ridge_df = pd.DataFrame(
    {'Actual_score': y_test, 'Predicted_score': pred_ridge}
)
ridge_df

In [54]:
# Test-set error metrics for the Ridge predictions.
R_Mean_Absolute_Error = metrics.mean_absolute_error(y_test, pred_ridge)
R_Mean_Sq_Error = metrics.mean_squared_error(y_test, pred_ridge)
R_Root_Mean_Sqrr = np.sqrt(metrics.mean_squared_error(y_test, pred_ridge))

# Score the Ridge model itself. The previous version called linear_reg.score
# here, so the R squared shown next to the Ridge errors belonged to the plain
# linear regression.
print('R squared:{:.2f}'.format(model_ridge.score(X_test, y_test)))
print('Mean Absolute Error:', R_Mean_Absolute_Error)
print('Mean Square Error:', R_Mean_Sq_Error)
print('Root Mean Square Error:', R_Root_Mean_Sqrr)

R squared:0.09
Mean Absolute Error: 0.1465132443027173
Mean Square Error: 0.03379315508292199
Root Mean Square Error: 0.18382914644561124


In [53]:
# Candidate regularisation strengths to compare.
alphas = [0.001, 0.01, 0.1, 1]

# 5-fold cross-validated Ridge over the four candidate alphas, fitted on the
# training split.
ridgecv = RidgeCV(alphas=alphas, cv=5)
ridgecv.fit(X_train, y_train)

# After fitting, the selected alpha is exposed as `ridgecv.alpha_`.
best_ridge_alpha = ridgecv.alpha_
print("Best alpha: ", best_ridge_alpha)

Best alpha:  0.1
