<div>
<a id="DataExplore"></a>
<h1 style="font-family: cursive; background-color:#03045eff; padding:20px; border-radius:10px; border:3px solid #869d9d;color:#FFFFFF;text-align:center" >Exploratory Data Analysis (EDA)</h1>
</div>

<div>
<a id="GeneralExploration"></a>
<h1 style="font-family: cursive; background-color:#6C5F83; padding:20px; border-radius:10px; border:3px solid #869d9d;color:#FFFFFF;text-align:center" >🔎🛠 General Exploration</h1>
</div>

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Previewing the first few rows
</p>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='data.csv')

# Show the top of the table (head() returns the first 5 rows by default)
print(data.head(n=5))

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Displaying the shape of the data
</p>

In [None]:
# Dimensions of the DataFrame as a (row count, column count) tuple
print((len(data.index), len(data.columns)))

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Determining data types for columns
</p>

In [None]:
# dtype of every column, one entry per column
column_types = data.dtypes
print(column_types)

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Checking for missing values
</p>

In [None]:
# Number of missing entries in each column (isna() is the canonical name; isnull() is its alias)
print(data.isna().sum())

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Seeing some summary statistics
</p>

In [None]:

# Descriptive statistics per column, with the default quartiles spelled out explicitly
print(data.describe(percentiles=[0.25, 0.5, 0.75]))

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Examining the distribution of the data
</p>

In [None]:
# Histogram of the column's values
data['column_name'].plot.hist()
plt.show()

# Box-and-whisker plot of the same column
data.boxplot(column=['column_name'])
plt.show()

# Kernel density estimate of the same column
sns.kdeplot(data['column_name'])
plt.show()

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Checking for correlation in the data
</p>

In [None]:
# Correlation matrix over the numeric columns only.
# numeric_only=True is required because DataFrame.corr() raises on
# string/datetime columns in pandas >= 2.0 instead of silently skipping them.
corr = data.corr(numeric_only=True)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
plt.show()

# Check the correlation between two *different* variables.
# (Correlating a column with itself is trivially 1.0 and tells you nothing.)
corr_val = data['column_name'].corr(data['another_column_name'])
print(corr_val)

<div>
<a id="DataCleaning"></a>
<h1 style="font-family: cursive; background-color:#6C5F83; padding:20px; border-radius:10px; border:3px solid #869d9d;color:#FFFFFF;text-align:center" >🔎🛠 Data Cleaning</h1>
</div>

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Removing duplicate data
</p>

In [None]:
# Keep only the first occurrence of each fully identical row
data = data[~data.duplicated()]

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Handling missing values
</p>

In [None]:
# NOTE: the statements below are alternative strategies. Once rows with
# missing values have been dropped, the fills that follow have nothing left
# to fill, so in practice keep only the strategy you need.

#Remove rows with missing values
data = data.dropna()

#Replace missing values with the mean or median
# Assign the result back instead of calling fillna(..., inplace=True) on
# data['column_name']: that chained in-place form operates on a temporary,
# triggers a FutureWarning in pandas 2.x and does not update `data` at all
# once Copy-on-Write is enabled.
data['column_name'] = data['column_name'].fillna(data['column_name'].mean())
data['column_name'] = data['column_name'].fillna(data['column_name'].median())

#Replace missing values with a value at the same position of another column
data['column_name'] = data['column_name'].fillna(data['another_column_name'])


<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Fixing data in the wrong format
</p>

In [None]:
# Cast the column to floating point, then to its string representation
data['column_name'] = data['column_name'].astype(dtype=float)
data['column_name'] = data['column_name'].astype(dtype=str)

# Interpret the column as dates written as year-month-day (e.g. 2024-01-31)
data['column_name'] = pd.to_datetime(arg=data['column_name'], format='%Y-%m-%d')


<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Removing outliers using z-score or interquartile range (IQR)
</p>

In [None]:
#Remove outliers using z-score
from scipy import stats

# Outlier rules only make sense for numeric columns; running zscore/quantile
# on the whole frame fails as soon as it contains string or datetime columns.
numeric_cols = data.select_dtypes(include='number').columns

# Keep rows whose every numeric value lies within 3 standard deviations of its column mean
z_scores = np.abs(stats.zscore(data[numeric_cols]))
data = data[(z_scores < 3).all(axis=1)]

#Remove outliers using Interquartile range (IQR)
numeric = data[numeric_cols]
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
# A value is an outlier when it falls more than 1.5 * IQR outside [Q1, Q3];
# drop every row that has at least one such value.
is_outlier = (numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))
data = data[~is_outlier.any(axis=1)]


<div>
<a id="DataTransformation"></a>
<h1 style="font-family: cursive; background-color:#6C5F83; padding:20px; border-radius:10px; border:3px solid #869d9d;color:#FFFFFF;text-align:center" >🔎🛠 Data Transformation</h1>
</div>

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Normalizing data
</p>

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: learn the column's min and max, then rescale it
scaler = MinMaxScaler().fit(data[['column_name']])
data[['column_name']] = scaler.transform(data[['column_name']])


<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Scaling data
</p>

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardization: learn the column's mean and standard deviation, then rescale it
scaler = StandardScaler().fit(data[['column_name']])
data[['column_name']] = scaler.transform(data[['column_name']])


<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Encoding categorical variables
</p>

In [None]:
# Encode categorical variables using One-Hot Encoder
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data[['column_name']])
# Reuse data's own index for the encoded frame. Without it the new frame gets
# a fresh 0..n-1 index, and if rows were dropped from `data` earlier the
# column-wise concat below would align on mismatched labels and create
# spurious all-NaN rows.
encoded_data = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(['column_name']),
    index=data.index,
)
# Swap the original categorical column for its indicator columns
data = data.drop('column_name', axis=1)
data = pd.concat([data, encoded_data], axis=1)


<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Binning
</p>

In [None]:
# Bin edges at 0, 10, 20, 30, 40, 50; each value is replaced by the interval it falls into
bins = list(range(0, 51, 10))
data['column_name'] = pd.cut(x=data['column_name'], bins=bins)


<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Transforming non-numeric data
</p>

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct values of the column, then replace each value with its integer code
encoder = LabelEncoder().fit(data['column_name'])
data['column_name'] = encoder.transform(data['column_name'])


<div>
<a id="DataVisualization"></a>
<h1 style="font-family: cursive; background-color:#6C5F83; padding:20px; border-radius:10px; border:3px solid #869d9d;color:#FFFFFF;text-align:center" >🔎🛠 Data Visualization</h1>
</div>

<p style="background-color:#F0EACF; font-size:16px;padding:15px;color:#03045eff; font-family: cursive; border-radius:10px">
Common types of visualizations
</p>

In [None]:
# Line plot of one column against another
data.plot.line(x='column_name', y='column_name')
plt.show()

# Histogram of a single column
data['column_name'].plot.hist()
plt.show()

# Scatter plot of two columns
plt.scatter(x=data['column_name'], y=data['column_name'])
plt.show()

# Pie chart: wedge sizes from one column, wedge labels from another
data.plot.pie(y='column_name', labels=data['column_name'])
plt.show()

# Box-and-whisker plot of a single column
data.boxplot(column=['column_name'])
plt.show()

# Grid of pairwise relationships between the columns of the frame
sns.pairplot(data=data)
plt.show()
