# Import necessary libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the CSV file into a pandas DataFrame

In [None]:
# Locations to search for the dataset, in priority order:
#   1. the 'student-data' subfolder next to the notebook
#   2. the notebook's own directory
csv_candidates = [
    'student-data/ResearchInformation3.csv',
    'ResearchInformation3.csv',
]

# Try each candidate in turn. Using for/else (instead of reading the fallback
# inside an `except` block) avoids a chained traceback and lets the error
# message list every location that was searched.
for csv_path in csv_candidates:
    try:
        df = pd.read_csv(csv_path)
        break
    except FileNotFoundError:
        continue
else:
    raise FileNotFoundError(
        "ResearchInformation3.csv not found; looked in: " + ", ".join(csv_candidates)
    )

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Print a concise summary of the dataset: one line per column with its name,
# non-null count and data type. Useful for spotting missing values and
# columns that were parsed with an unexpected dtype.
df.info()

In [None]:
# Print the DataFrame's column names to confirm how each feature is spelled.
# Note: the previous-GPA column is referenced as 'Last' (not 'Last_GPA') by
# the later cells in this notebook.
print(df.columns)

In [None]:
# Bar chart of English marks (the marks are discrete, so one bar per mark).
# Count how many students received each mark, ordered by mark value.
english_value_counts = df['English'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(english_value_counts.index, english_value_counts.values)
ax.set_title('Frequency of Each English Mark')
ax.set_xlabel('English Marks')
ax.set_ylabel('Frequency (Number of Students)')
ax.set_xticks(english_value_counts.index)  # one tick per observed mark value
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()

# Box plot of English Marks to visualize distribution and outliers

In [None]:
# Box plot of the English marks: shows the median, quartiles and any outliers.
fig, ax = plt.subplots(figsize=(8, 6))  # adjust figure size if needed
ax.boxplot(df['English'])
ax.set_title('Box Plot of English Marks')
ax.set_ylabel('English Marks')
plt.show()

In [None]:
# Bulk Feature Exploration - Multiple Features at Once
#
# For every listed feature: print its dtype, then
#   * text-like / categorical columns -> unique values + bar chart of counts
#   * numeric columns                 -> descriptive statistics + histogram
#   * anything else (bool, datetime)  -> flagged for manual exploration

features_to_explore = [
    'Hometown', 'Income', 'Computer', 'Preparation', 'Gaming', 'Attendance',
    'Job', 'Extra', 'Semester', 'Last', 'Overall', 'HSC', 'SSC', 'Department'
] # List of feature names to explore

for feature_name in features_to_explore:
    print(f"\n--- Feature: {feature_name} ---")

    column = df[feature_name]

    # Check data type
    print(f"  Data Type: {column.dtype}")

    # Classify by dtype *family* rather than comparing against the literal
    # names 'object' / 'int64' / 'float64', so int32, float32, nullable Int64,
    # pandas string and category columns are handled as well.
    is_categorical = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
        or isinstance(column.dtype, pd.CategoricalDtype)
    )
    # Booleans count as "numeric" for pandas, but a histogram of them is not
    # meaningful, so they are deliberately left to the fallback branch.
    is_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )

    if is_categorical:
        unique_values = column.unique()
        print(f"  Unique Values: {unique_values}")
        print(f"  Number of Unique Values: {len(unique_values)}")

        # Bar chart for categorical features
        value_counts = column.value_counts()
        fig, ax = plt.subplots(figsize=(8, 5)) # Adjust figure size as needed
        # Labels are cast to str so mixed-type object columns still plot.
        ax.bar(value_counts.index.astype(str), value_counts.values)
        ax.set_title(f'Distribution of {feature_name}')
        ax.set_xlabel(feature_name)
        ax.set_ylabel('Frequency')
        # Rotate x-axis labels so long category names stay readable
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout() # Adjust layout to prevent labels from overlapping
        plt.show()
        plt.close(fig) # Release the figure; many are created in this loop

    elif is_numeric: # For numerical features
        # Descriptive statistics for numerical features
        print(f"  Descriptive Statistics:\n{column.describe()}")

        # Histogram for numerical features. Missing values are dropped first
        # because the histogram range cannot be computed when NaN is present.
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.hist(column.dropna(), bins=20) # You can adjust bins as needed
        ax.set_title(f'Distribution of {feature_name}')
        ax.set_xlabel(feature_name)
        ax.set_ylabel('Frequency')
        plt.show()
        plt.close(fig) # Release the figure; many are created in this loop
    else:
        print("  Data type not easily handled for exploration in this script.")
        print("  Manual exploration might be needed.")

# Data Preprocessing - One-Hot Encoding for Categorical Features in Bulk

In [6]:
categorical_features = [
    'Gender', 'Hometown', 'Computer', 'Preparation', 'Gaming', 'Attendance',
    'Job', 'Extra', 'Semester', 'Department', 'Income'
] # List of categorical feature names

# A feature counts as "already encoded" when its source column is gone but
# columns carrying its '<feature>_' prefix exist. A feature that is neither
# present nor encoded is a real error (e.g. a misspelled name) and is reported.
unknown_features = [
    name for name in categorical_features
    if name not in df.columns
    and not any(str(col).startswith(f"{name}_") for col in df.columns)
]
if unknown_features:
    raise KeyError(f"Categorical columns not found in df: {unknown_features}")

# Only encode what is still un-encoded, so re-running this cell is a no-op
# instead of failing with a KeyError on the columns dropped by the first run.
columns_to_encode = [name for name in categorical_features if name in df.columns]

if columns_to_encode:
    # One call replaces each listed column with its '<feature>_<value>' dummy
    # columns, appended after the remaining columns in the order listed.
    df = pd.get_dummies(df, columns=columns_to_encode)

In [7]:
# Preview the first rows of df to check the one-hot encoded columns
# (original categorical columns are replaced by '<feature>_<value>' columns).
df.head()

Unnamed: 0,HSC,SSC,English,Last,Overall,Gender_Female,Gender_Male,Hometown_City,Hometown_Village,Computer_1,...,"Income_High (Above 50,000)","Income_High (Above 50,000).1","Income_High (Above 50,000).2","Income_Low (Below 15,000)","Income_Low (Below 15,000).1","Income_Lower middle (15,000-30,000)","Income_Lower middle (15,000-30,000).1","Income_Lower middle (15,000-30,000).2","Income_Upper middle (30,000-50,000)","Income_Upper middle (30,000-50,000).1"
0,4.17,4.84,3,3.22,3.35,False,True,False,True,False,...,False,False,False,True,False,False,False,False,False,False
1,4.92,5.0,3,3.467,3.467,True,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
2,5.0,4.83,4,4.0,3.72,False,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
3,4.0,4.5,5,3.8,3.75,False,True,True,False,False,...,True,False,False,False,False,False,False,False,False,False
4,2.19,3.17,3,3.94,3.94,True,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [8]:
# List every column name of df to inspect the generated dummy columns.
df.columns

Index(['HSC', 'SSC', 'English', 'Last', 'Overall', 'Gender_Female',
       'Gender_Male', 'Hometown_City', 'Hometown_Village', 'Computer_1',
       'Computer_2', 'Computer_3', 'Computer_4', 'Computer_5',
       'Preparation_0-1 Hour', 'Preparation_2-3 Hours',
       'Preparation_More than 3 Hours', 'Gaming_0-1 Hour', 'Gaming_2-3 Hours',
       'Gaming_More than 3 Hours', 'Attendance_40%-59%', 'Attendance_60%-79%',
       'Attendance_80%-100%', 'Attendance_Below 40%', 'Job_No', 'Job_Yes',
       'Extra_No', 'Extra_Yes', 'Semester_10th', 'Semester_11th',
       'Semester_12th', 'Semester_2nd', 'Semester_3rd', 'Semester_4th',
       'Semester_5th', 'Semester_6th', 'Semester_7th', 'Semester_8th',
       'Semester_9th', 'Department_Business Administration',
       'Department_Computer Science and Engineering', 'Department_Economics',
       'Department_Electrical and Electronic Engineering',
       'Department_English',
       'Department_Journalism, Communication and Media Studies',
      

In [10]:
# Data Preprocessing - Scaling Numerical Features
# Columns to standardize; the one-hot encoded columns are left out on purpose.
numerical_features_to_scale = ['HSC', 'SSC', 'Last', 'Overall', 'English']

# StandardScaler rescales each column to zero mean and unit variance.
scaler = StandardScaler()

# Learn each column's mean/std, then write the standardized values back
# into the same columns of df.
scaler.fit(df[numerical_features_to_scale])
scaled_values = scaler.transform(df[numerical_features_to_scale])
df[numerical_features_to_scale] = scaled_values

# Show the first few rows of the scaled numerical features
df[numerical_features_to_scale].head()

Unnamed: 0,HSC,SSC,Last,Overall,English
0,0.024224,0.207206,0.087562,0.27337,-0.655427
1,1.397978,0.665419,0.473003,0.471273,-0.655427
2,1.544511,0.178568,1.304745,0.899215,0.494486
3,-0.28716,-0.766495,0.992647,0.949959,1.644398
4,-3.602485,-4.575387,1.211116,1.271339,-0.655427


In [12]:
# Sanity-check the scaling: every column's mean should be ~0 and its std ~1.
# The std is reported as 1.001016 rather than exactly 1 because describe()
# uses the sample standard deviation (ddof=1) while StandardScaler divides by
# the population standard deviation (ddof=0): sqrt(493 / 492) ~= 1.001016.
df[numerical_features_to_scale].describe() # Display descriptive statistics of scaled features

Unnamed: 0,HSC,SSC,Last,Overall,English
count,493.0,493.0,493.0,493.0,493.0
mean,-2.882526e-17,-2.882526e-17,-1.153011e-16,0.0,2.1618950000000002e-17
std,1.001016,1.001016,1.001016,1.001016,1.001016
min,-3.639119,-5.062237,-3.37673,-3.701592,-2.955251
25%,-0.5985442,-0.2510061,-0.55224,-0.521622,-0.6554267
50%,0.02422413,0.4935892,0.1343764,0.138052,0.4944856
75%,0.6286757,0.6654189,0.7897829,0.831556,0.4944856
max,1.544511,0.6654189,1.304745,1.372828,1.644398
