## MLOps and DevOps Assignment 2

In [10]:
#import all the necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression

Let's see the first few rows of our dataset

In [11]:
# Read covid_data.csv and lower-case every column header in one chained
# expression, so later cells can refer to columns by lower-case names.
covidDF = pd.read_csv('covid_data.csv').rename(columns=str.lower)

# Preview the first five rows.
covidDF.head()

Unnamed: 0,date_reported,country_code,country,who_region,new_cases,cumulative_cases,new_deaths,cumulative_deaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0


Handle Missing Values

In [12]:
# Discard every row that contains at least one missing value.
# This mutates covidDF in place.
covidDF.dropna(axis=0, how='any', inplace=True)

# Per-column count of remaining nulls, displayed as the cell output.
covidDF.isna().sum()

date_reported        0
country_code         0
country              0
who_region           0
new_cases            0
cumulative_cases     0
new_deaths           0
cumulative_deaths    0
dtype: int64

Preprocess necessary columns

In [13]:
# --- Date features ----------------------------------------------------------
# Parse the report date once, then expose its calendar parts as numeric columns.
covidDF['date_reported'] = pd.to_datetime(covidDF['date_reported'])
covidDF['Day'] = covidDF['date_reported'].dt.day
covidDF['Month'] = covidDF['date_reported'].dt.month
covidDF['Year'] = covidDF['date_reported'].dt.year
covidDF['Weekday'] = covidDF['date_reported'].dt.weekday  # Monday=0 ... Sunday=6

# --- Categorical columns ----------------------------------------------------
# Use one LabelEncoder per column. Re-fitting a single shared encoder would
# overwrite the country mapping when who_region is encoded, making it
# impossible to inverse_transform country codes back to names.
label_encoders = {}
for column in ('country', 'who_region'):
    label_encoders[column] = LabelEncoder()
    covidDF[column] = label_encoders[column].fit_transform(covidDF[column])

# Backward-compatible alias: the name `label_encoder` previously ended up
# fitted on who_region (the last column it was applied to).
label_encoder = label_encoders['who_region']

# --- Count columns (min-max scaling to [0, 1]) ------------------------------
# Likewise, keep one fitted MinMaxScaler per column so that values in the
# normalized space (e.g. model predictions of Normalized_new_cases) can be
# mapped back to real counts with scalers['new_cases'].inverse_transform(...).
# NOTE(review): the scalers are fitted on the full dataset before the
# train/test split performed in a later cell, so test-set statistics leak
# into the scaling. Consider fitting them on the training rows only.
count_columns = ['new_cases', 'cumulative_cases', 'new_deaths', 'cumulative_deaths']
scalers = {}
for column in count_columns:
    scalers[column] = MinMaxScaler()
    covidDF[f'Normalized_{column}'] = scalers[column].fit_transform(covidDF[[column]])

# Backward-compatible alias: the name `scaler` previously ended up fitted on
# cumulative_deaths (the last column it was applied to).
scaler = scalers['cumulative_deaths']

# The raw count columns are superseded by their normalized versions.
covidDF.drop(count_columns, axis=1, inplace=True)


### Predict new cases per country by month

In [14]:
# Predictor columns and regression target.
feature_columns = [
    'Month',
    'country',
    'who_region',
    'Normalized_cumulative_cases',
    'Normalized_new_deaths',
    'Normalized_cumulative_deaths',
]
X = covidDF[feature_columns]
y = covidDF['Normalized_new_cases']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Single-step pipeline wrapping a linear regression model.
pipeline = Pipeline(steps=[('model', LinearRegression())])

# Fit on the training split (fit returns the pipeline itself), then predict
# the held-out rows.
y_prediction = pipeline.fit(X_train, y_train).predict(X_test)

### Find top-3 predictive features according to 3 different methods for measuring predictiveness.

In [6]:
# Rank the predictors of the target with three different scoring methods and
# print the top three features for each.

# Method 1: ANOVA F-value (f_regression) via SelectKBest
print("measuring predictivenesses using SelectKBest, with f_regression")
f_test = SelectKBest(score_func=f_regression, k=3)
f_test.fit(X_train, y_train)

# get_support() is a boolean mask over the columns; True marks the k selected features.
selected_features_f_test = X_train.columns[f_test.get_support()]

print(selected_features_f_test)

print()

print("measuring predictivenesses using the randomforestregressor")
# Method 2: impurity-based feature importances from a RandomForestRegressor.
# The seed is fixed so the importances (and the ranking) are reproducible
# across runs, matching the random_state used for the train/test split.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

feature_importances = model.feature_importances_
sorted_feature_importances = sorted(zip(X_train.columns, feature_importances), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importances[:3]:
    print(f"{feature}: {importance}")

print()
# Method 3: mutual information (mutual_info_regression).
# The label previously said "Elastic Net regression", which is not what runs here.
print("measuring predictivenesses using mutual_info_regression")
# The estimator is randomized, so the seed is fixed for reproducible scores.
# NOTE(review): Month, country and who_region are integer-coded; consider
# passing discrete_features for them. Left at the default to keep scores comparable.
mutal_info_scores = mutual_info_regression(X_train, y_train, random_state=0)
sorted_feature_scores = sorted(zip(X_train.columns, mutal_info_scores), key=lambda x: x[1], reverse=True)

for feature, score in sorted_feature_scores[:3]:
    print(f"{feature}: {score}")


measuring predictivenesses using SelectKBest, with f_regression
Index(['Normalized_cumulative_cases', 'Normalized_new_deaths',
       'Normalized_cumulative_deaths'],
      dtype='object')

measuring predictivenesses using the randomforestregressor
Normalized_cumulative_cases: 0.4391672713857072
Normalized_cumulative_deaths: 0.22474185063473454
Normalized_new_deaths: 0.1543138577532856

measuring predictivenesses using Elastic Net regression
Normalized_cumulative_cases: 0.7487716341193389
Normalized_cumulative_deaths: 0.634306766536139
Normalized_new_deaths: 0.5582183597085004


### Regression error metrics (MSE, MAE, R²)

In [9]:
# Score the held-out predictions with three regression metrics.
meanSquaredError = mean_squared_error(y_test, y_prediction)
meanAbsoluteError = mean_absolute_error(y_test, y_prediction)
r2Score = r2_score(y_test, y_prediction)

# Report them in a fixed order: MSE, MAE, then R^2.
for label, value in (
    ("mean squared error", meanSquaredError),
    ("mean absolute error", meanAbsoluteError),
    ("r2 score", r2Score),
):
    print(f"{label}: {value}")

mean squared error: 2.3248517419935346e-05
mean absolute error: 0.0004386689664164878
r2 score: 0.13742200853442543
