In [None]:
# Colab-only helper module used to attach Google Drive to this runtime
from google.colab import drive
# ROOT is the mount point; later cells build the project path on top of it.
ROOT="/content/drive"
# force_remount=True asks Colab to mount again even if the drive is already attached
drive.mount(ROOT, force_remount=True)

In [None]:
# GitHub repository / project path setup
# `join` is used to combine ROOT (the Drive mount point) with MY_GOOGLE_DRIVE_PATH
# source: https://medium.com/p/5554d5824228
from os.path import join

# path to your project on Google Drive, relative to the mount point
MY_GOOGLE_DRIVE_PATH = 'MyDrive/dlss24'

# replace with your GitHub username
GIT_USERNAME = "CT-P"

# Replace with the name of your GitHub repository; here it is the
# course repository "dlss24"
GIT_REPOSITORY = "dlss24"

# Absolute path of the project folder inside the mounted drive
PROJECT_PATH = join(ROOT, MY_GOOGLE_DRIVE_PATH)

# It's good to print out the value if you are not sure
print("PROJECT_PATH: ", PROJECT_PATH)

In [None]:
%pwd

In [None]:
%cd {PROJECT_PATH}

In [None]:
%ls

In [None]:
%cd dlss24

In [None]:
!git status

In [None]:
!git branch

In [None]:
!git checkout workplace

skip these:

In [None]:
!git add .

In [None]:
!git config --global user.email "trindade.catarina@hotmail.com"
!git config --global user.name "CT-P"

In [None]:
!git commit -m"adding git commands"

In [None]:
!git push

In [None]:
!git status

# Week 2 - Class 2 - Data Analysis

This notebook has good example exercises to get you going on data analysis

# Coronavirus (COVID-19) Visualization & Prediction  
Coronavirus is a family of viruses that are named after their spiky crown. The novel coronavirus, also known as SARS-CoV-2, is a contagious respiratory virus that was first reported in Wuhan, China. On 2/11/2020, the World Health Organization designated the name COVID-19 for the disease caused by the novel coronavirus. This notebook aims at exploring COVID-19 through data analysis and projections.

   Coronavirus Case Data is provided by <a href='https://github.com/CSSEGISandData/COVID-19'>Johns Hopkins University</a>
   <br>Learn more from the <a href='https://www.who.int/emergencies/diseases/novel-coronavirus-2019'>World Health Organization</a>
   <br>Learn more from the <a href='https://www.cdc.gov/coronavirus/2019-ncov'>Centers for Disease Control and Prevention</a>
   <br>Check out map visualizations from  <a href='https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6'>JHU CSSE Dashboard</a>
   <br>Source code is also on <a href='https://github.com/therealcyberlord'>my Github</a>
   
   
   ```Last update: 1/16/2023 7:41 PM ET. New Updates: Daily report data update for 1/16/23. time series data update for 1/15/23. ```

source: https://www.kaggle.com/code/therealcyberlord/coronavirus-covid-19-visualization-prediction/notebook

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd
import random
import math
import time
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator
%matplotlib inline
import warnings

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so 'seaborn-poster' raises on current installations.
# Use whichever poster style this matplotlib provides, preferring the new name;
# if neither exists the default style is kept.
_poster_styles = [
    style_name
    for style_name in ('seaborn-v0_8-poster', 'seaborn-poster')
    if style_name in plt.style.available
]
if _poster_styles:
    plt.style.use(_poster_styles[0])

# Suppress all warnings for the rest of the notebook (keeps cell output short,
# but also hides deprecation notices).
warnings.filterwarnings("ignore")

In [None]:
# Data is read straight from the Johns Hopkins CSSE COVID-19 GitHub repository (network access required).
# Global time series of confirmed cases
confirmed_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
# Global time series of deaths
deaths_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
# Daily report file for 01-15-2023
latest_data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/01-15-2023.csv')

In [None]:
confirmed_df.head()

In [None]:
confirmed_df.shape

In [None]:
def count_elements_with_characters(lst):
    """Count the elements of ``lst`` that contain a lowercase ASCII letter.

    Args:
        lst: iterable of strings (for example a list of column labels).

    Returns:
        int: how many elements have at least one character in 'a'..'z'.
        Elements made only of digits, punctuation or uppercase letters
        are not counted.
    """
    return sum(
        1
        for text in lst
        if any('a' <= letter <= 'z' for letter in text)
    )

In [None]:
def count_non_date_elements(lst):
    """Count the elements of ``lst`` that are not dates in ``%m/%d/%y`` form.

    Args:
        lst: iterable of strings (iterating a DataFrame yields its column
            labels, so a DataFrame can be passed directly).

    Returns:
        int: number of elements that ``datetime.strptime`` rejects with a
        ``ValueError`` for the format ``'%m/%d/%y'``.
    """
    from datetime import datetime

    def _is_date(text):
        # strptime signals a non-matching string with ValueError
        try:
            datetime.strptime(text, '%m/%d/%y')
        except ValueError:
            return False
        return True

    return sum(1 for text in lst if not _is_date(text))


In [None]:
# Column labels of both tables
confirmed_cols, deaths_cols = confirmed_df.columns, deaths_df.columns

# Keep everything from the fifth column onwards; the first four columns are
# presumably location metadata rather than daily counts (verify with .head()).
first_confirmed_label = confirmed_cols[4]
first_deaths_label = deaths_cols[4]
confirmed = confirmed_df.loc[:, first_confirmed_label:]
deaths = deaths_df.loc[:, first_deaths_label:]

In [None]:
count_elements_with_characters(confirmed_cols)


In [None]:
count_non_date_elements(confirmed)

In [None]:
# Number of date columns, and the column labels of both tables
num_dates = len(confirmed.columns)
ck = confirmed.columns
dk = deaths.columns

# Empty accumulators; the same names are re-initialised and filled by the
# aggregation loop in a later cell.
world_cases = []
total_deaths = []
mortality_rate = []



In [None]:
num_dates

In [None]:
# what is this number?
# It is the sum over all rows of the last date column, i.e. the world-wide
# confirmed-case total on the most recent date in the data.
# Indexing with -1 instead of the hard-coded 1143-1 keeps this correct for any
# number of date columns.
confirmed[ck[-1]].sum()

In [None]:
num_dates = len(confirmed.columns)
ck = confirmed.columns
dk = deaths.columns

# One total per date column: sum over all rows of the confirmed / deaths tables
world_cases = [confirmed[ck[day]].sum() for day in range(num_dates)]
total_deaths = [deaths[dk[day]].sum() for day in range(num_dates)]

# Mortality rate per date = deaths so far / confirmed cases so far
mortality_rate = [
    dead / cases
    for dead, cases in zip(total_deaths, world_cases)
]

In [None]:
def daily_increase(data):
    """Turn a cumulative series into per-step increases.

    Args:
        data: indexable sequence of cumulative values.

    Returns:
        list: same length as ``data``; the first entry is ``data[0]`` itself
        and every later entry is the difference to the previous value.
    """
    return [
        data[pos] if pos == 0 else data[pos] - data[pos - 1]
        for pos in range(len(data))
    ]

def moving_average(data, window_size):
    """Forward-looking moving average of ``data``.

    Args:
        data: sliceable sequence of numbers.
        window_size: number of values averaged per position.

    Returns:
        list: one ``np.mean`` per position ``i``, taken over
        ``data[i:i + window_size]``; near the end of the series the window
        is cut off at the last element, so fewer values are averaged.
    """
    total = len(data)
    return [
        # the window never reaches past the end of the series
        np.mean(data[first:min(first + window_size, total)])
        for first in range(total)
    ]

# window size: number of consecutive values averaged by moving_average
window = 7

# confirmed cases: per-day increase of the cumulative totals, plus moving
# averages of both the cumulative and the per-day series
world_daily_increase = daily_increase(world_cases)
world_confirmed_avg= moving_average(world_cases, window)
world_daily_increase_avg = moving_average(world_daily_increase, window)

# deaths: same three series as for confirmed cases
world_daily_death = daily_increase(total_deaths)
world_death_avg = moving_average(total_deaths, window)
world_daily_death_avg = moving_average(world_daily_death, window)

In [None]:
# Day index 0, 1, 2, ... (one per date column) as a single-column 2-D array
days_since_1_22 = np.arange(len(ck)).reshape((-1, 1))
# Same single-column layout for the cumulative totals
world_cases = np.array(world_cases).reshape((-1, 1))
total_deaths = np.array(total_deaths).reshape((-1, 1))

In [None]:
# Number of days to predict beyond the last observed date
days_in_future = 10
# Day indices covering the observed range plus the forecast horizon, as a column
future_forcast = np.arange(len(ck) + days_in_future).reshape(-1, 1)
# Day indices of the observed range only. Slicing by the number of observed
# dates (instead of a hard-coded [:-10]) stays correct when days_in_future is
# changed, including 0.
adjusted_dates = future_forcast[:len(ck)]

In [None]:
# Calendar date matching day index 0
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
# One formatted date string per entry of future_forcast
future_forcast_dates = [
    (start_date + datetime.timedelta(days=offset)).strftime('%m/%d/%Y')
    for offset in range(len(future_forcast))
]

In [None]:
# Regression models cannot pick up the pattern of the whole series, so only the
# data from 8/1/22 onwards (day index 922 counted from 1/22/2020) is modelled.
days_to_skip = 922
# shuffle=False keeps chronological order: the last 7% of the days form the test set
(
    X_train_confirmed,
    X_test_confirmed,
    y_train_confirmed,
    y_test_confirmed,
) = train_test_split(
    days_since_1_22[days_to_skip:],
    world_cases[days_to_skip:],
    test_size=0.07,
    shuffle=False,
)

In [None]:
X_train_confirmed

In [None]:

# Expand the day index into polynomial terms up to degree 3
poly = PolynomialFeatures(degree=3)
# Fit the transformer on the training data only ...
poly_X_train_confirmed = poly.fit_transform(X_train_confirmed)
# ... and reuse the fitted transformer for the test and forecast inputs
# (transform, not fit_transform, so nothing is re-fitted on unseen data).
poly_X_test_confirmed = poly.transform(X_test_confirmed)
poly_future_forcast = poly.transform(future_forcast)

In [None]:
# polynomial regression: ordinary least squares on the polynomial features
# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2
# and now raises a TypeError, so it is no longer passed. For plain least squares
# the fitted predictions do not depend on feature scaling (up to numerical precision).
linear_model = LinearRegression(fit_intercept=True)
linear_model.fit(poly_X_train_confirmed, y_train_confirmed)
# Predictions for the held-out days and for the full forecast range
test_linear_pred = linear_model.predict(poly_X_test_confirmed)
linear_pred = linear_model.predict(poly_future_forcast)
# scikit-learn metrics take (y_true, y_pred) in that order
print('MAE:', mean_absolute_error(y_test_confirmed, test_linear_pred))
print('MSE:',mean_squared_error(y_test_confirmed, test_linear_pred))

In [None]:
print(linear_model.coef_)

In [None]:
# Held-out data and model predictions on the same axes
plt.plot(y_test_confirmed, label='Test Data')
plt.plot(test_linear_pred, label='Polynomial Regression Predictions')
plt.legend()

## Q16

In [None]:
# https://scikit-learn.org/stable/getting_started.html#fitting-and-predicting-estimator-basics
# Apply a Random Forest Classifier and do the same plot

## Q17

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris dataset
iris_data = load_iris()

# Create a DataFrame
<your-answer>

# Add the target (species) column to the DataFrame
<your-answer>

# Map target numerical values to species names
<your-answer>




In [None]:
# Display basic information
print("Basic Information:")

print("\nData Types:")


In [None]:

print("\nSummary Statistics:")


In [None]:

# Data Exploration
print("\nMean Values by Species:")



In [None]:

print("\nMedian Values by Species:")


In [None]:

print("\nMinimum Values by Species:")


In [None]:

print("\nMaximum Values by Species:")


In [None]:

print("\nCount of Species:")


# Submit your code to your Github

In [28]:
!git branch

  main[m
* [32mworkplace[m


In [29]:
!git checkout main

D	Class 2 - Lab 28.04 - Data Analysis.ipynb
M	commands_everytime.ipynb
Switched to branch 'main'
Your branch is up to date with 'origin/main'.


In [30]:
!git merge workplace

Already up to date.


In [32]:
!git config --global user.name "CT-P"
!git config --global user.email "trindade.catarina@hotmail.com"
!git config --global user.password "{GIT_TOKEN}"

In [33]:
!git commit -m "Merge workplace into main"

[main 41e7a39] Merge workplace into main
 1 file changed, 1 insertion(+), 1 deletion(-)
 rewrite commands_everytime.ipynb (86%)


In [34]:
!git push origin main

Enumerating objects: 1Enumerating objects: 5, done.
Counting objects:  20% (1/5)Counting objects:  40% (2/5)Counting objects:  60% (3/5)Counting objects:  80% (4/5)Counting objects: 100% (5/5)Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)Writing objects:  66% (2/3)Writing objects: 100% (3/3)Writing objects: 100% (3/3), 1.51 KiB | 171.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas:   0% (0/2)[Kremote: Resolving deltas:  50% (1/2)[Kremote: Resolving deltas: 100% (2/2)[Kremote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/CT-P/dlss24.git
   46b428b..41e7a39  main -> main


In [35]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mdeleted:    Class 2 - Lab 28.04 - Data Analysis.ipynb[m
	[31mmodified:   commands_everytime.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mClass 3 - Lab 06.03 - Data Analysis.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")
