In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data=pd.read_csv('student-mat.csv')
data.head()

Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;""F"";18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";17;""U"";""GT3"";""T"";1;1;""at_home"";""other"";..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."
3,"GP;""F"";15;""U"";""GT3"";""T"";4;2;""health"";""services..."
4,"GP;""F"";16;""U"";""GT3"";""T"";3;3;""other"";""other"";""h..."


In [None]:
# Check data shape and basic information
print("Dataset Shape:", data.shape)
print("\nDataset Info:")
data.info()

In [None]:
# Display data summary statistics
print("Dataset Description:")
print(data.describe())
print("\n" + "="*50)
print("Data Types:")
print(data.dtypes)

In [None]:
# Check for missing values and unique values
print("Missing Values:")
print(data.isnull().sum())
print("\n" + "="*50)
print("Unique values in each column:")
for col in data.columns:
    print(f"{col}: {data[col].nunique()} unique values")
print("\n" + "="*50)
print("First 5 rows and last 5 rows:")
print("Head:")
print(data.head())
print("\nTail:")
print(data.tail())

# Data Formatting and Cleaning

Now we'll format and clean the data for better analysis:

In [None]:
# Format column names (remove spaces, standardize case)
print("Original column names:")
print(data.columns.tolist())

# Clean column names - remove extra spaces and standardize
data.columns = data.columns.str.strip()  # Remove leading/trailing spaces
data.columns = data.columns.str.replace(' ', '_')  # Replace spaces with underscores
data.columns = data.columns.str.lower()  # Convert to lowercase

print("\nFormatted column names:")
print(data.columns.tolist())

In [None]:
# Format categorical variables
# Convert categorical columns to proper categories
categorical_cols = data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols.tolist())

# Display unique values for categorical columns (first few for brevity)
for col in categorical_cols[:5]:  # Show first 5 categorical columns
    print(f"\n{col} unique values:")
    print(data[col].unique())

In [None]:
# Format numerical variables
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
print("Numerical columns:", numerical_cols.tolist())

# Check for outliers using IQR method for numerical columns
print("\nOutlier Analysis (using IQR method):")
for col in numerical_cols[:5]:  # Show first 5 numerical columns
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
    print(f"{col}: {len(outliers)} outliers detected")

In [None]:
# Data visualization for better understanding
plt.figure(figsize=(15, 10))

# Plot histogram for numerical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
n_cols = len(numerical_cols)
n_rows = (n_cols + 3) // 4  # 4 columns per row

for i, col in enumerate(numerical_cols[:8]):  # Show first 8 numerical columns
    plt.subplot(n_rows, 4, i+1)
    plt.hist(data[col], bins=20, alpha=0.7)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()