## Imports & File Structure

In [1]:
# Structure of CSV file
# F1: Date, Manufacturer, Type of Device, Exposed
# F2: Date, Manufacturer, Exposed, Number of Countries
# F3: Date, type, amount
# F4: Date, Exploit, Infected, Number of Countries

from os import listdir
import pandas as pd
from pandas.core.frame import DataFrame

## Retrieve the various infections

In [2]:
# Aggregate every file under data/infected into one long frame that holds,
# per date and per infection, the summed remaining columns of that file.
INFECTION_COLUMNS = ['date', 'infection', 'count']

# Sorted so the processing order is deterministic (listdir order is arbitrary).
files = sorted(listdir('data/infected'))

# Per-file aggregates are collected here and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies every accumulated row on each iteration.
infection_frames = []

for f in files:
  df = pd.read_csv(f'data/infected/{f}')
  # Keyword form: the positional axis argument of drop() is deprecated.
  df = df.drop(columns='geo')

  if df.empty:
    # A file without rows still contributes one placeholder row for its date.
    df = pd.DataFrame([{'infection': '?', 'count': 0}], columns=df.columns)
  else:
    df = df[pd.notna(df.infection)]

  df_infections = df.groupby(['infection']).sum()

  df_infections = df_infections.reset_index()
  # The first 10 characters of the file name are used as the date.
  df_infections.insert(0, 'date', f[:10])

  infection_frames.append(df_infections)

if infection_frames:
  df_final = pd.concat(infection_frames)
  # Declared columns first, then whatever else the files carried (e.g. the
  # 'country' column), which is the layout the append-based loop produced.
  extra_columns = [c for c in df_final.columns if c not in INFECTION_COLUMNS]
  df_final = df_final.reindex(columns=INFECTION_COLUMNS + extra_columns)
else:
  df_final = pd.DataFrame(columns=INFECTION_COLUMNS)

In [4]:

# Remove the 'country' column, order the rows by date and then by count, and
# write the result to infected_by_infection.csv.
# errors='ignore' keeps this cell re-runnable once 'country' is already gone;
# the keyword form replaces the deprecated positional axis argument.
df_final = (
  df_final
  .drop(columns=['country'], errors='ignore')
  .sort_values(by=['date', 'count'])
  .reset_index(drop=True)
)
df_final.to_csv('infected_by_infection.csv', index=False)

  df_final = df_final.drop(['country'], 1)


In [5]:
# Show the per-date, per-infection totals (large frames render truncated).
df_final

Unnamed: 0,date,infection,count
0,2021-05-03,CVE-2014-2321,1
1,2021-05-03,CVE-2020-8958,1
2,2021-05-03,SSV-97217,1
3,2021-05-03,CVE-2020-8515,3
4,2021-05-03,qsnatch,3
...,...,...,...
21457,2022-05-04,android.rootnik,27568
21458,2022-05-04,telnet-brute-force,28826
21459,2022-05-04,android.bakdoor.prizmes,61329
21460,2022-05-04,mirai,97244


## Retrieve number of strains

In [6]:
# Names of the files under data/infected; the first 10 characters of each
# name are used as the date further down. listdir returns them in arbitrary order.
files = listdir('data/infected')
def count_values(values):
  """Return how many ';'-separated entries `values` holds.

  A string that is empty or contains only whitespace yields 0; any other
  string yields the number of pieces produced by splitting on ';'.
  """
  is_blank = values.strip() == ''
  return 0 if is_blank else len(values.split(';'))
# Empty accumulator for the strain counts: one 'date' and one 'strain_count' column.
strain_columns = ['date', 'strain_count']
df_final = pd.DataFrame(columns=strain_columns)

In [7]:
# For every file, count how many distinct non-null infection names it holds.
# The rows are gathered as plain records and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop.
strain_records = []

for f in sorted(files):
  df = pd.read_csv(f'data/infected/{f}')
  # Keyword form: the positional axis argument of drop() is deprecated.
  df = df.drop(columns=['geo', 'country'])

  strain_records.append({
    # The first 10 characters of the file name are used as the date.
    'date': f[:10],
    # nunique() ignores NaN and gives 0 for a frame without rows, which is
    # the same number the filter + groupby + shape[0] sequence produced.
    'strain_count': df['infection'].nunique(),
    # Helper column kept on purpose: the export cell drops a column named 'index'.
    'index': 0,
  })

# Rebuilding the frame from scratch makes the cell safe to re-run.
df_final = pd.DataFrame(strain_records, columns=['date', 'strain_count', 'index'])

  df = df.drop(['geo', 'country'], 1)


In [8]:
# Drop the helper 'index' column, order the rows by date, and write the result
# to infected_by_strain.csv.
# errors='ignore' keeps this cell re-runnable once 'index' is already gone;
# the keyword form replaces the deprecated positional axis argument.
df_final = (
  df_final
  .drop(columns='index', errors='ignore')
  .sort_values(by=['date'])
  .reset_index(drop=True)
)
df_final.to_csv('infected_by_strain.csv', index=False)

  df_final = df_final.drop('index', 1)


In [None]:
# NOTE(review): this cell has no execution count and repeats the cell under
# "## Geo-infected"; consider keeping only one of the two.
#
# Sum the remaining columns of every file per (geo, infection) pair and stack
# the per-file results into df_geo.
GEO_COLUMNS = ['date', 'infection', 'country', 'count']

# Sorted so the processing order is deterministic (listdir order is arbitrary).
files = sorted(listdir('data/infected'))

# Collected and concatenated once: DataFrame.append was removed in pandas 2.0
# and re-copied the whole accumulated frame on every iteration.
geo_frames = []

for f in files:
  df = pd.read_csv(f'data/infected/{f}')
  # Keyword form: the positional axis argument of drop() is deprecated.
  df = df.drop(columns=['country'])

  df_infections = df.groupby(['geo', 'infection']).sum()

  df_infections = df_infections.reset_index()
  # The first 10 characters of the file name are used as the date.
  df_infections.insert(0, 'date', f[:10])

  geo_frames.append(df_infections)

if geo_frames:
  df_geo = pd.concat(geo_frames)
  # Declared columns first, then the extra ones, matching the previous layout.
  # NOTE(review): 'country' is dropped above, so that declared column stays
  # empty while 'geo' ends up among the extras; confirm whether 'geo' was
  # meant to be stored as 'country'.
  extra_columns = [c for c in df_geo.columns if c not in GEO_COLUMNS]
  df_geo = df_geo.reindex(columns=GEO_COLUMNS + extra_columns)
else:
  df_geo = pd.DataFrame(columns=GEO_COLUMNS)

df_geo

In [9]:
# Show the per-date strain counts (large frames render truncated).
df_final

Unnamed: 0,date,strain_count
0,2021-05-03,36
1,2021-05-04,35
2,2021-05-05,34
3,2021-05-06,62
4,2021-05-07,67
...,...,...
358,2022-04-30,42
359,2022-05-01,41
360,2022-05-02,41
361,2022-05-03,41


## Geo-infected

In [17]:
# Sum the remaining columns of every file per (geo, infection) pair and stack
# the per-file results into df_geo.
GEO_COLUMNS = ['date', 'infection', 'country', 'count']

# Sorted so the processing order is deterministic (listdir order is arbitrary).
files = sorted(listdir('data/infected'))

# Collected and concatenated once: DataFrame.append was removed in pandas 2.0
# and re-copied the whole accumulated frame on every iteration, which makes
# the loop slower the more files have been processed.
geo_frames = []

for f in files:
  df = pd.read_csv(f'data/infected/{f}')
  # Keyword form: the positional axis argument of drop() is deprecated.
  df = df.drop(columns=['country'])

  df_infections = df.groupby(['geo', 'infection']).sum()

  df_infections = df_infections.reset_index()
  # The first 10 characters of the file name are used as the date.
  df_infections.insert(0, 'date', f[:10])

  geo_frames.append(df_infections)

if geo_frames:
  df_geo = pd.concat(geo_frames)
  # Declared columns first, then the extra ones, matching the previous layout.
  # NOTE(review): 'country' is dropped above, so that declared column stays
  # empty while 'geo' ends up among the extras; confirm whether 'geo' was
  # meant to be stored as 'country'.
  extra_columns = [c for c in df_geo.columns if c not in GEO_COLUMNS]
  df_geo = df_geo.reindex(columns=GEO_COLUMNS + extra_columns)
else:
  df_geo = pd.DataFrame(columns=GEO_COLUMNS)

df_geo

  df = df.drop(['country'], 1)


KeyboardInterrupt: 

## Retrieve country infected

In [7]:
from os import listdir

# Sum the counts per country for every file under data/infected and write the
# stacked result to infected_by_country.csv.
COUNTRY_COLUMNS = ['date', 'country', 'count']

files = listdir('data/infected')

# Per-file results are collected and concatenated once instead of calling
# pd.concat on the growing frame in every iteration.
country_frames = []

for f in sorted(files):
  df = pd.read_csv(f'data/infected/{f}')
  df = df.drop(['infection', 'geo'], axis=1)

  if not df.empty:
    # A single dropna keeps the rows where both 'country' and 'count' are
    # present; the previous df[mask_a][mask_b] form applied a mask built on the
    # unfiltered frame to the filtered one, which makes pandas reindex the
    # mask and emit a warning.
    df = df.dropna(subset=['country', 'count']).groupby(['country']).sum()
    df_data = df.reset_index()
    # The first 10 characters of the file name are used as the date.
    df_data.insert(0, 'date', f[:10])
    country_frames.append(df_data)

if country_frames:
  df_final = pd.concat(country_frames, ignore_index=True)
  # Declared columns first, then any additional ones the files carried.
  extra_columns = [c for c in df_final.columns if c not in COUNTRY_COLUMNS]
  df_final = df_final[COUNTRY_COLUMNS + extra_columns]
else:
  df_final = pd.DataFrame(columns=COUNTRY_COLUMNS)

df_final.to_csv('infected_by_country.csv', index=False)