# Learnplatform Covid19 Impact on Digital Learning

## This Notebook is deivided into Two main parts

## 1. Data Pre-processing
## 2. Exploratory Data Analysis (EDA)

## Data Reading and Pre-processing

### Importing Required packages and Libraries

In [1]:
# Importing necessary packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re

import warnings
warnings.filterwarnings("ignore")


### Path to different Data files, scripts files and files 

In [2]:
districts_data_path = "../data/districts_info.csv"
products_data_path = "../data/products_info.csv"
engagements_data_path = "../data/engagement_data/"

### Loading datasets

In [9]:
districts_data = pd.read_csv(districts_data_path)
products_data = pd.read_csv(products_data_path)

products_data.info()
districts_data.info()
print(f" Shape of Products_data: {products_data.shape}, and that of District_data is: {districts_data.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   LP ID                       372 non-null    int64 
 1   URL                         372 non-null    object
 2   Product Name                372 non-null    object
 3   Provider/Company Name       371 non-null    object
 4   Sector(s)                   352 non-null    object
 5   Primary Essential Function  352 non-null    object
dtypes: int64(1), object(5)
memory usage: 17.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   district_id               233 non-null    int64 
 1   state                     176 non-null    object
 2   locale                    176 non-null    object
 3   pct_black/hispanic

### Showing sample data

In [8]:
products_data.sample(4)

Unnamed: 0,LP ID,URL,Product Name,Provider/Company Name,Sector(s),Primary Essential Function
256,16164,https://www.nytimes.com,nytimes.com,The New York Times,PreK-12; Higher Ed; Corporate,"LC - Sites, Resources & Reference"
357,94218,http://www.viewpure.com/,ViewPure,ViewPure,PreK-12,CM - Classroom Engagement & Instruction - Clas...
47,30851,https://www.google.com/chrome/,Google Chrome,Google LLC,PreK-12; Higher Ed; Corporate,LC/CM/SDO - Other
110,49094,http://www.schooltube.com/,SchoolTube,SchoolTube,PreK-12; Higher Ed,CM - Classroom Engagement & Instruction - Clas...


In [7]:
districts_data.sample(4)

Unnamed: 0,district_id,state,locale,pct_black/hispanic,pct_free/reduced,county_connections_ratio,pp_total_raw
67,5604,California,Suburb,"[0, 0.2[","[0, 0.2[","[0.18, 1[",
166,6665,,,,,,
51,7798,New York,Rural,"[0, 0.2[","[0.2, 0.4[","[0.18, 1[","[22000, 24000["
194,5510,Washington,City,"[0.2, 0.4[","[0.4, 0.6[","[0.18, 1[","[14000, 16000["


### Calculating the Missing Values, NA

In [14]:
def total_percent_missing_data(df):

        # Calculate total number of cells in dataframe
        totalCells = np.product(df.shape)

        # Count number of missing values per column
        missingCount = df.isnull().sum()

        # Calculate total number of missing values
        totalMissing = missingCount.sum()

        # Calculate percentage of missing values
        return round(((totalMissing/totalCells) * 100), 2)


def missing_data_per_column(df):
        item_list = []
        row_list = []
        new_columns=['Column', 'No. of Missing Values', '% Missing Values per column']
        total_no_data_per_column = df.shape[0]-1
        i=0
        for item in df.columns:
            no_missing_values = df[item].isna().sum()
            percentage = str(round(((no_missing_values/total_no_data_per_column) * 100), 2))+" %"
            row_list.append(item)
            row_list.append(no_missing_values)
            row_list.append(percentage)
            item_list.append(row_list)
            row_list = []

        df_data = pd.DataFrame(item_list, columns = new_columns)
        return df_data


missing_values = total_percent_missing_data(districts_data)
missing_df = missing_data_per_column(districts_data)

print(f" Summary of Missing Values in districts_data : {missing_values} %")
print("Missing values per column in districts_data")
missing_df

 Summary of Missing Values in districts_data : 27.1 %
Missing values per column in districts_data


Unnamed: 0,Column,No. of Missing Values,% Missing Values per column
0,district_id,0,0.0 %
1,state,57,24.57 %
2,locale,57,24.57 %
3,pct_black/hispanic,57,24.57 %
4,pct_free/reduced,85,36.64 %
5,county_connections_ratio,71,30.6 %
6,pp_total_raw,115,49.57 %


In [15]:
missing_values = total_percent_missing_data(products_data)
missing_df = missing_data_per_column(products_data)

print(f" Summary of Missing Values in products_data : {missing_values} %")
print("Missing values per column in products_data")
missing_df

 Summary of Missing Values in products_data : 1.84 %
Missing values per column in products_data


Unnamed: 0,Column,No. of Missing Values,% Missing Values per column
0,LP ID,0,0.0 %
1,URL,0,0.0 %
2,Product Name,0,0.0 %
3,Provider/Company Name,1,0.27 %
4,Sector(s),20,5.39 %
5,Primary Essential Function,20,5.39 %
