# 1. Ingest and Access Data

In [None]:
import pandas as pd
import numpy as np

# Load the raw data with exactly ONE reader. Previously both lines ran, so the
# Excel read always overwrote the CSV read.
df = pd.read_csv('')
# Excel alternative -- swap it with the line above if the source is a workbook.
# Add skiprows=4, usecols=range(1, 14) if the sheet has blank rows or columns.
# df = pd.read_excel('')

Take a quick look at the headings and data. Pull column names if desired.

In [None]:
# Capture the column names first so that df.head() is the cell's last
# expression: a notebook cell only renders the value of its final expression.
col_list = df.columns.to_list()
df.head()

Evaluate the information in the dataframe. Start making a plan of what to do next.

In [None]:
# Print a summary of the dataframe (DataFrame.info)
df.info()

In [None]:
# Descriptive statistics for the dataframe (DataFrame.describe)
df.describe()

In [None]:
# Deeper dive on a specific column: count how often each distinct value occurs.
# 'Column Name' is presumably a placeholder -- replace with a real column label.
df['Column Name'].value_counts()

Are there nulls or duplicates?

In [None]:
# Number of missing (NA) values in each column
df.isna().sum()

In [None]:
# Number of rows flagged as duplicates by DataFrame.duplicated()
df.duplicated().sum()

# 2. Clean Data
Establish another dataframe to work from.

In [None]:
# Work on an independent copy. A plain `df_clean = df` only creates a second
# name for the same object, so in-place edits would also change the raw data.
df_clean = df.copy()

### a. Handle Missing Values
- What does it look like to drop all nulls?
- Are there any columns with lots of missing data? Can we remove them?
- In numeric columns, can we replace the missing data with the mean or median? (Based on the distribution)

In [None]:
# Drop all nulls info: summarize what would remain after dropping nulls.
# The result of dropna() is not assigned, so df_clean is not reassigned here.
df_clean.dropna().info() 

In [None]:
# Remove one column by its label
df_clean = df_clean.drop(columns='Column Name')

In [None]:
# Replace the column's missing values with its mean (use .median() instead if preferred)
fill_value = df_clean['Column Name'].mean()
df_clean['Column Name'] = df_clean['Column Name'].fillna(fill_value)

### b. Do we need to handle any duplicates?

In [None]:
# Rebind df_clean to the result of drop_duplicates()
df_clean = df_clean.drop_duplicates()

### c. Erroneous Values
- Are there any values that don't make sense?
- Gather more context for data if needed.

### d. Handle Outliers
- This should handle anything we didn't catch in the previous step.

In [None]:
import statistics as stats

# outlier function
def outliers_IQR(column, factor=1.5):
    """Flag the values of *column* that lie outside the IQR fences.

    Args:
        column: A pandas Series (or any object offering ``quantile`` and
            element-wise comparisons).
        factor: Multiple of the interquartile range added beyond Q1/Q3 to
            form the fences. Defaults to 1.5, the value that was previously
            hard-coded.

    Returns:
        A boolean mask that is True where a value is below
        ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR``. The input is
        not modified.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    reach = factor * (q3 - q1)
    return (column < q1 - reach) | (column > q3 + reach)

# apply function: keep the mask separate from the data. Assigning it back to
# the column would replace every value with True/False.
outlier_mask = outliers_IQR(df_clean['Column Name'])
df_clean = df_clean[~outlier_mask]  # keep only the rows that are not outliers

### e. Check Data Types
- Do any data types need to be changed after cleaning is complete?

In [None]:
# Data type of each column in the cleaned dataframe
df_clean.dtypes

In [None]:
# Convert the column to numeric values and store it back
as_numbers = pd.to_numeric(df_clean['Column Name'])
df_clean['Column Name'] = as_numbers

### f. Data Structure
- Add any necessary calculated or reformatted columns for visualization/analysis

### g. Data Enrichment
- Do we need more data?

# 3. Analyze/Visualize Data

## Export to .csv and complete in another program

In [None]:
# Write the cleaned data to disk. to_csv() returns None when it is given a
# path, so there is nothing useful to assign.
df_clean.to_csv('FileName.csv', index=True)

## Complete here in the Jupyter notebook.

In [None]:
# figure(), title(), show() etc. live in matplotlib.pyplot, not in the
# top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sns

### Histogram

In [None]:
# Histogram of one column, drawn on an explicit 10x5 inch Axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df_clean, x='', ax=ax)
ax.set_title('')
ax.set_xlabel('Response')
plt.show()

### Pie Chart

In [None]:
plt.figure(figsize=(5, 5))

data = df_clean['Column Name']

# Share of each category. value_counts() orders by frequency and skips NaN,
# so the labels must come from its index. unique() is in order of appearance
# and includes NaN, which can mislabel the wedges or give a length mismatch.
data_percents = data.value_counts(normalize=True)
data_labels = data_percents.index.tolist()

palette_color = sns.color_palette('pastel')

plt.pie(data_percents, labels=data_labels,
        autopct='%1.1f%%', pctdistance=0.85, colors=palette_color)

plt.title('')

plt.show()

### Line Graph

In [None]:
# Line plot of total_units against time.
# NOTE(review): `time` and `total_units` are not defined in the visible
# notebook -- presumably placeholders to be supplied; confirm.
sns.lineplot(
	x= time, y= total_units)

### Scatterplot

In [None]:
# Relational plot of sales against temps.
# NOTE(review): `temps` and `sales` are not defined in the visible
# notebook -- presumably placeholders to be supplied; confirm.
sns.relplot(
	x= temps, y= sales)

### Multiline Plot

In [None]:
# Keep the month column plus one column per flavor
by_flavor = sales_flavor[["Month", 'Vanilla', 'Chocolate', 'Strawberry', 'Neapolitan', 'Mango', 'Lime']]

# Reshape to long form (one row per month/flavor pair). Naming the melted
# columns lets seaborn build a correctly labelled legend from `hue`.
long_sales = pd.melt(by_flavor, id_vars=["Month"], var_name='Flavor', value_name='Units')
sns.lineplot(data=long_sales, x='Month', y='Units', hue='Flavor')

plt.title('Total Units Sold by Month')
plt.xlabel('Month')
# NOTE(review): `time` is not defined in this cell -- presumably the month values; confirm.
plt.xticks(time)
plt.ylabel('Total Units Sold (quantity)')
# The former plt.legend(sales_flavor) passed the whole DataFrame as labels, so
# its column names were assigned to the lines in order. The legend created by
# `hue` above already maps each line to its flavor.
plt.show()