# Advanced Pandas Assignment

This notebook contains five advanced Pandas exercises with their complete solutions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
%matplotlib inline

# Set random seed for reproducibility: this seeds NumPy's global random
# state, which every np.random.* call in this notebook draws from.
np.random.seed(42)

## Question 1: Advanced Data Cleaning and Imputation

You are given a dataset of customer information with missing values, outliers, and inconsistent data types. Your task is to clean the data, handle missing values using advanced imputation techniques, and prepare it for analysis.

1. Load the dataset and display its information.
2. Identify and handle outliers in numerical columns.
3. Impute missing values using appropriate methods (mean, median, or advanced techniques like KNN imputation).
4. Handle inconsistent data types and format issues.
5. Create a function that automates this cleaning process for similar datasets.

In [None]:
# Create a sample dataset with issues.
# Age, Income and Credit_Score are stored as floats from the start: NaN cannot
# be held in an integer column, and assigning it into one relies on a silent
# int -> float upcast that pandas has deprecated.
df = pd.DataFrame({
    'CustomerID': range(1, 1001),
    'Age': np.random.randint(18, 90, 1000).astype(float),
    'Income': np.random.randint(20000, 200000, 1000).astype(float),
    'Credit_Score': np.random.randint(300, 850, 1000).astype(float),
    'Purchase_Amount': np.random.randint(100, 10000, 1000)
})

# Introduce missing values (100 Age, 100 Income, 50 Credit_Score rows) ...
df.loc[np.random.choice(df.index, 100, replace=False), 'Age'] = np.nan
df.loc[np.random.choice(df.index, 100, replace=False), 'Income'] = np.nan
df.loc[np.random.choice(df.index, 50, replace=False), 'Credit_Score'] = np.nan
# ... and outliers: 20 rows at twice the largest Age, 20 rows at ten times
# the largest Income (these may overwrite some of the NaNs set above).
df.loc[np.random.choice(df.index, 20, replace=False), 'Age'] = df['Age'].max() * 2
df.loc[np.random.choice(df.index, 20, replace=False), 'Income'] = df['Income'].max() * 10

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
df.info()
print(df.describe())

def clean_customer_data(df):
    """Return a cleaned copy of a customer DataFrame.

    Steps, in order:

    1. Clip ``Age``, ``Income``, ``Credit_Score`` and ``Purchase_Amount`` to
       the IQR fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
    2. Fill missing ``Age`` with the column median and missing ``Income``
       with the column mean.
    3. Fill missing ``Credit_Score`` with a 5-nearest-neighbour estimate
       computed from the standardised ``Age``, ``Income`` and
       ``Purchase_Amount`` columns.
    4. Cast ``Age``, ``Income`` and ``Credit_Score`` to ``int`` (fractional
       parts are truncated).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # Make a copy of the dataframe
    df_clean = df.copy()

    # Handle outliers using IQR method
    for column in ['Age', 'Income', 'Credit_Score', 'Purchase_Amount']:
        Q1 = df_clean[column].quantile(0.25)
        Q3 = df_clean[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean[column] = df_clean[column].clip(lower_bound, upper_bound)

    # Impute missing values. Assign the result back instead of calling
    # fillna(inplace=True) on df_clean[col]: that form mutates a temporary
    # object and leaves df_clean untouched under pandas Copy-on-Write.
    df_clean['Age'] = df_clean['Age'].fillna(df_clean['Age'].median())
    df_clean['Income'] = df_clean['Income'].fillna(df_clean['Income'].mean())

    # Use KNN imputation for Credit_Score. The imputer needs *other* features
    # to find neighbours; given Credit_Score alone it can only fall back to
    # the column mean. scikit-learn is imported only when there is something
    # to impute.
    if df_clean['Credit_Score'].isna().any():
        from sklearn.impute import KNNImputer

        predictors = df_clean[['Age', 'Income', 'Purchase_Amount']]
        # Put the predictors on a common scale so Income's large magnitude
        # does not dominate the neighbour distances; constant columns get a
        # divisor of 1 to avoid dividing by zero.
        scale = predictors.std(ddof=0).replace(0, 1)
        features = (predictors - predictors.mean()) / scale
        features['Credit_Score'] = df_clean['Credit_Score']
        imputed = KNNImputer(n_neighbors=5).fit_transform(features)
        # Credit_Score was appended last, so it is the final output column.
        df_clean['Credit_Score'] = imputed[:, -1]

    # Ensure correct data types
    df_clean['Age'] = df_clean['Age'].astype(int)
    df_clean['Income'] = df_clean['Income'].astype(int)
    df_clean['Credit_Score'] = df_clean['Credit_Score'].astype(int)

    return df_clean

# Run the cleaning pipeline on the raw customer data and inspect the result
df_cleaned = clean_customer_data(df)
print("\nCleaned DataFrame:")
# info() writes its report to stdout itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df_cleaned.info()
print(df_cleaned.describe())

## Question 2: Time Series Analysis and Forecasting

You have a dataset of daily stock prices for a company over the past 5 years. Your task is to analyze this time series data and create a simple forecasting model.

1. Load and prepare the time series data.
2. Perform time series decomposition to separate trend, seasonality, and residuals.
3. Implement a simple moving average model for forecasting.
4. Use an ARIMA model for more advanced forecasting.
5. Evaluate and compare the performance of both models.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Generate sample stock price data: one synthetic price per calendar day
dates = pd.date_range(start='2018-01-01', end='2022-12-31', freq='D')
# Gaussian random walk (zero-mean steps, so no drift term) offset by 100
prices = 100 + np.random.randn(len(dates)).cumsum()
df_stock = pd.DataFrame({'Date': dates, 'Price': prices}).set_index('Date')

# Time series decomposition (additive model, yearly period of 365 days)
decomposition = seasonal_decompose(df_stock['Price'], model='additive', period=365)
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 16))
# One stacked panel per component, top to bottom
panels = [
    (ax1, decomposition.observed, 'Observed'),
    (ax2, decomposition.trend, 'Trend'),
    (ax3, decomposition.seasonal, 'Seasonal'),
    (ax4, decomposition.resid, 'Residual'),
]
for panel_axis, component, panel_title in panels:
    component.plot(ax=panel_axis)
    panel_axis.set_title(panel_title)
plt.tight_layout()

# Simple Moving Average model
def moving_average_forecast(series, window):
    """Return a one-step-ahead simple moving average forecast.

    The value at position ``t`` is the mean of the ``window`` observations
    immediately *before* ``t``. The rolling mean is shifted by one step so
    the forecast for ``t`` never includes the observation at ``t`` itself;
    without the shift the "forecast" would leak the value it is later scored
    against.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order.
    window : int
        Number of past observations to average.

    Returns
    -------
    pandas.Series
        Same index as ``series``; the first ``window`` entries are NaN.
    """
    return series.rolling(window=window).mean().shift(1)

# Moving-average series over the full price history (30-observation window)
price_series = df_stock['Price']
ma_forecast = moving_average_forecast(price_series, window=30)

# ARIMA model: fit on the first 80% of observations, hold out the rest
split_point = int(0.8 * len(df_stock))
train = price_series[:split_point]
test = price_series[split_point:]

model = ARIMA(train, order=(1, 1, 1))
results = model.fit()
horizon = len(test)
arima_forecast = results.forecast(steps=horizon)

# Evaluate models on the hold-out window
ma_mse = mean_squared_error(test, ma_forecast[-horizon:])
arima_mse = mean_squared_error(test, arima_forecast)

print(f"Moving Average MSE: {ma_mse}")
print(f"ARIMA MSE: {arima_mse}")

# Plot results: actual prices, the moving average, and the ARIMA forecast
plt.figure(figsize=(12, 6))
curves = [
    (df_stock.index, price_series, 'Actual'),
    (df_stock.index, ma_forecast, 'Moving Average'),
    (test.index, arima_forecast, 'ARIMA'),
]
for x_values, y_values, curve_label in curves:
    plt.plot(x_values, y_values, label=curve_label)
plt.legend()
plt.title('Stock Price Forecasting')
plt.show()

## Question 3: Advanced Data Transformation and Analysis

You have a large dataset of e-commerce transactions. Your task is to perform advanced data transformation and analysis to derive meaningful insights.

1. Load the dataset and perform any necessary data cleaning.
2. Use advanced groupby operations to analyze sales patterns.
3. Implement a customer segmentation based on recency, frequency, and monetary value (RFM analysis).
4. Create a function to identify the top products for each customer segment.
5. Visualize the results using appropriate plots.

In [None]:
# Generate sample e-commerce data
n_transactions = 100000
df_ecommerce = pd.DataFrame({
    'CustomerID': np.random.randint(1, 1001, n_transactions),
    'Date': pd.date_range(start='2022-01-01', end='2022-12-31', periods=n_transactions),
    'ProductID': np.random.randint(1, 101, n_transactions),
    'Quantity': np.random.randint(1, 10, n_transactions),
    'Price': np.random.uniform(10, 1000, n_transactions)
})

df_ecommerce['TotalAmount'] = df_ecommerce['Quantity'] * df_ecommerce['Price']

# Advanced groupby operations: total sales per (month, product), pivoted so
# that rows are months and columns are products
sales_patterns = df_ecommerce.groupby([df_ecommerce['Date'].dt.month, 'ProductID'])['TotalAmount'].sum().unstack()
print("Monthly sales patterns for each product:")
print(sales_patterns.head())

# RFM Analysis. Named aggregation ties every output column to its source
# explicitly (no positional rename), and recency is derived with one
# vectorised subtraction instead of a Python lambda per customer.
today = df_ecommerce['Date'].max()
customer_stats = df_ecommerce.groupby('CustomerID').agg(
    LastPurchase=('Date', 'max'),
    Frequency=('Date', 'count'),
    Monetary=('TotalAmount', 'sum'),
)
rfm = pd.DataFrame({
    'Recency': (today - customer_stats['LastPurchase']).dt.days,
    'Frequency': customer_stats['Frequency'],
    'Monetary': customer_stats['Monetary'],
})

# Segment customers. Recency labels run 4 -> 1 so that the most recent
# buyers (smallest recency) receive the highest score.
r_labels = list(range(4, 0, -1))
f_labels = list(range(1, 5))
m_labels = list(range(1, 5))

# Quartiles are cut on ranks rather than raw values: Recency (whole days) and
# Frequency (counts) are heavily tied, and qcut raises "Bin edges must be
# unique" when two quantile edges coincide. rank(method='first') breaks ties
# by order of appearance, so the four bins are always well defined.
r_quartiles = pd.qcut(rfm['Recency'].rank(method='first'), q=4, labels=r_labels)
f_quartiles = pd.qcut(rfm['Frequency'].rank(method='first'), q=4, labels=f_labels)
m_quartiles = pd.qcut(rfm['Monetary'].rank(method='first'), q=4, labels=m_labels)

rfm['R'] = r_quartiles
rfm['F'] = f_quartiles
rfm['M'] = m_quartiles

# Three-character segment code, e.g. '444' for the top quartile on all axes
rfm['RFM_Score'] = rfm['R'].astype(str) + rfm['F'].astype(str) + rfm['M'].astype(str)

# Function to identify top products for each segment
def top_products_by_segment(df, rfm, segment, n=5):
    """Return the best-selling products among customers of one RFM segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with ``CustomerID``, ``ProductID`` and ``TotalAmount``
        columns.
    rfm : pandas.DataFrame
        Indexed by customer id, with a string ``RFM_Score`` column.
    segment : str or int
        Segment code to select, e.g. ``'444'``; converted with ``str()`` so
        ``444`` works as well.
    n : int, default 5
        How many products to return.

    Returns
    -------
    pandas.Series
        Summed ``TotalAmount`` per ``ProductID``, largest first, at most
        ``n`` entries. Empty when no customer belongs to ``segment``.
    """
    segment_customers = rfm.index[rfm['RFM_Score'] == str(segment)]
    in_segment = df['CustomerID'].isin(segment_customers)
    revenue_per_product = df.loc[in_segment].groupby('ProductID')['TotalAmount'].sum()
    return revenue_per_product.nlargest(n)

# Example: Top products for the best customers (segment '444')
best_customers_products = top_products_by_segment(df_ecommerce, rfm, '444')
print("\nTop 5 products for best customers:", best_customers_products, sep="\n")

# Visualize RFM segments: recency vs. frequency, marker size by monetary
# value, colour by segment code
scatter_options = dict(
    x='Recency',
    y='Frequency',
    size='Monetary',
    hue='RFM_Score',
    palette='viridis',
)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=rfm, **scatter_options)
plt.title('Customer Segments based on RFM Analysis')
plt.show()

## Question 4: Advanced Data Visualization and Statistical Analysis

You have a dataset containing information about employees in a large company. Your task is to perform advanced data visualization and statistical analysis to uncover insights about employee performance and satisfaction.

1. Load and prepare the dataset.
2. Create advanced visualizations to explore relationships between variables.
3. Perform statistical tests to identify significant factors affecting employee performance.
4. Implement a simple predictive model for employee satisfaction.
5. Present your findings in a clear and concise manner.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Generate sample employee data (every column is drawn independently at random)
n_employees = 1000
gender_options = ['Male', 'Female']
education_options = ['Bachelor', 'Master', 'PhD']
department_options = ['HR', 'IT', 'Finance', 'Marketing', 'Operations']
# Column order is kept as-is: it fixes the order of the random draws and
# therefore the data generated under a given seed.
df_employees = pd.DataFrame({
    'Age': np.random.randint(22, 65, n_employees),
    'Gender': np.random.choice(gender_options, n_employees),
    'Education': np.random.choice(education_options, n_employees),
    'Experience': np.random.randint(0, 40, n_employees),
    'Salary': np.random.randint(30000, 150000, n_employees),
    'Department': np.random.choice(department_options, n_employees),
    'Performance': np.random.uniform(1, 5, n_employees),
    'Satisfaction': np.random.uniform(1, 5, n_employees),
})

# Advanced visualizations
# 1) Salary spread per department, split by gender
_, salary_axis = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_employees, x='Department', y='Salary', hue='Gender', ax=salary_axis)
salary_axis.set_title('Salary Distribution by Department and Gender')
plt.show()

# 2) Experience against performance, coloured by education, sized by salary
_, performance_axis = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_employees,
    x='Experience',
    y='Performance',
    hue='Education',
    size='Salary',
    ax=performance_axis,
)
performance_axis.set_title('Experience vs Performance by Education and Salary')
plt.show()

# Statistical tests
from scipy import stats

# T-test for performance difference between genders (two independent samples)
is_male = df_employees['Gender'] == 'Male'
is_female = df_employees['Gender'] == 'Female'
male_performance = df_employees.loc[is_male, 'Performance']
female_performance = df_employees.loc[is_female, 'Performance']
t_stat, p_value = stats.ttest_ind(male_performance, female_performance)
print(f"T-test for performance difference between genders: p-value = {p_value:.4f}")

# ANOVA for performance difference among departments: one sample per
# department, in order of first appearance
departments = df_employees['Department'].unique()
dept_performances = [
    df_employees.loc[df_employees['Department'] == dept, 'Performance']
    for dept in departments
]
f_stat, p_value = stats.f_oneway(*dept_performances)
print(f"ANOVA for performance difference among departments: p-value = {p_value:.4f}")

# Correlation analysis over the numeric columns
numeric_columns = ['Age', 'Experience', 'Salary', 'Performance', 'Satisfaction']
correlation_matrix = df_employees[numeric_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Predictive model for employee satisfaction: ordinary least squares on four
# numeric predictors, evaluated on a 20% hold-out split
feature_names = ['Age', 'Experience', 'Salary', 'Performance']
X = df_employees[feature_names]
y = df_employees['Satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r_squared = r2_score(y_test, y_pred)
print(f"\nEmployee Satisfaction Prediction Model R-squared: {r_squared:.4f}")
print("\nFeature Importance:")
# NOTE: these are raw regression coefficients in each feature's own units
# (years, currency, ...), so their magnitudes are not comparable across features.
coefficient_lines = [
    f"{feature}: {importance:.4f}"
    for feature, importance in zip(X.columns, model.coef_)
]
print("\n".join(coefficient_lines))

## Question 5: Big Data Processing with Pandas and Dask

You have a very large dataset (several gigabytes) of customer transactions that doesn't fit into memory. Your task is to process this data using Pandas and Dask to perform analysis and generate insights.

1. Set up a Dask DataFrame to handle the large dataset.
2. Perform basic exploratory data analysis using Dask.
3. Implement a data processing pipeline that includes filtering, grouping, and aggregation.
4. Compare the performance of Pandas and Dask for a subset of the data.
5. Visualize the results of your analysis.

In [None]:
import dask.dataframe as dd
import time

# Generate a large dataset (for demonstration it is built in memory; a real
# multi-gigabyte file would already exist on disk)
n_rows = 10_000_000
df = pd.DataFrame({
    'customer_id': np.random.randint(1, 100001, n_rows),
    'transaction_date': pd.date_range(start='2020-01-01', end='2022-12-31', periods=n_rows),
    'amount': np.random.uniform(10, 1000, n_rows),
    'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_rows)
})

# Save to CSV (in practice, this would be your large file)
df.to_csv('large_transactions.csv', index=False)

# Set up Dask DataFrame
ddf = dd.read_csv('large_transactions.csv')

# Basic exploratory data analysis using Dask
print("Dask DataFrame Info:")
# info() prints its summary itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
ddf.info()

print("\nColumn Data Types:")
print(ddf.dtypes)

print("\nSummary Statistics:")
# describe() is lazy on a Dask DataFrame; compute() triggers the actual pass
print(ddf.describe().compute())

# Data processing pipeline
def process_data(df, min_amount=500):
    """Summarise large transactions per category.

    Only operations available on both pandas and Dask DataFrames are used,
    so the same function serves either; with Dask the result stays lazy
    until the caller invokes ``.compute()``.

    Parameters
    ----------
    df : pandas.DataFrame or dask.dataframe.DataFrame
        Must contain ``amount`` and ``category`` columns.
    min_amount : float, default 500
        Transactions with ``amount`` strictly above this value are kept.

    Returns
    -------
    DataFrame
        One row per category with columns ``category``, ``amount`` (sum of
        the kept transactions) and ``percentage`` (share of the overall kept
        total, 0-100), sorted by ``amount`` in descending order.
    """
    # Filter transactions above the threshold
    filtered = df[df['amount'] > min_amount]

    # Group by category and calculate total amount
    grouped = filtered.groupby('category')['amount'].sum().reset_index()

    # Calculate percentage of total for each category
    total = grouped['amount'].sum()
    grouped['percentage'] = grouped['amount'] / total * 100

    return grouped.sort_values('amount', ascending=False)

# Process with Dask. dd.read_csv is lazy, so this timing covers reading and
# parsing the CSV as well as the filter/group/aggregate pipeline.
start_time = time.time()
dask_result = process_data(ddf).compute()
dask_time = time.time() - start_time
print("\nDask processing result:")
print(dask_result)
print(f"Dask processing time: {dask_time:.2f} seconds")

# Process with Pandas (using a subset of data). The timer starts before
# read_csv so both measurements include CSV parsing; timing only the
# in-memory step would compare different amounts of work.
subset_rows = 1_000_000
start_time = time.time()
pandas_df = pd.read_csv('large_transactions.csv', nrows=subset_rows)
pandas_result = process_data(pandas_df)
pandas_time = time.time() - start_time
print("\nPandas processing result (subset):")
print(pandas_result)
print(f"Pandas processing time (subset): {pandas_time:.2f} seconds")

# Visualize results
plt.figure(figsize=(10, 6))
plt.bar(dask_result['category'], dask_result['percentage'])
plt.title('Percentage of Total Amount by Category (Transactions > $500)')
plt.xlabel('Category')
plt.ylabel('Percentage of Total Amount')
plt.show()

# Compare performance (the Pandas estimate is a linear extrapolation from the
# subset to the full row count)
print(f"\nDask processed the entire dataset {n_rows:,} rows in {dask_time:.2f} seconds")
print(f"Pandas processed a subset of {subset_rows:,} rows in {pandas_time:.2f} seconds")
print(f"Estimated time for Pandas to process the entire dataset: {pandas_time * (n_rows / subset_rows):.2f} seconds")

This completes the advanced assignment with five questions covering various aspects of data analysis, visualization, and processing using Pandas and related libraries. Each question includes a detailed problem statement and a comprehensive solution that demonstrates advanced techniques in data manipulation, analysis, and visualization.

The questions cover:
1. Advanced Data Cleaning and Imputation
2. Time Series Analysis and Forecasting
3. Advanced Data Transformation and Analysis
4. Advanced Data Visualization and Statistical Analysis
5. Big Data Processing with Pandas and Dask

These exercises provide hands-on experience with real-world data challenges and advanced data science techniques using Python and Pandas.