In [None]:
# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
election_df = pd.read_csv('/content/raw_data_election2024.csv')  # Replace with actual file path

# Display column names to verify 'Age_Group' and 'Candidate_Preference' are present
print("Column Names:", election_df.columns)

# Data Overview
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
election_df.info()
print("\nBasic Statistics:")
print(election_df.describe())

# Drop rows with any missing values
# NOTE(review): this discards a row when *any* column is null, including
# columns the plots below never read. If some columns are sparsely populated
# this can remove most of the data -- consider dropna(subset=[...]) limited to
# the columns actually analysed. Behaviour is left unchanged here.
election_df = election_df.dropna()

# Age-group analysis: box plot of 'Candidate_Preference' per 'Age_Group'.
# When either column is absent, a message is printed and no figure is drawn.
age_required = {'Age_Group', 'Candidate_Preference'}
if not age_required.issubset(election_df.columns):
    print("Columns 'Age_Group' and/or 'Candidate_Preference' not found for age group analysis.")
else:
    age_fig, age_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(
        data=election_df,
        x='Age_Group',
        y='Candidate_Preference',
        palette='Set2',
        ax=age_ax,
    )
    age_ax.set_title('Candidate Preference by Age Group')
    age_ax.set_xlabel('Age Group')
    age_ax.set_ylabel('Candidate Preference (%)')
    plt.show()

# Regional breakdown: number of rows per 'Region', split by 'Candidate'.
# Runs only when both columns are present; otherwise a message is printed.
has_region_columns = all(
    column in election_df.columns for column in ('Region', 'Candidate')
)
if has_region_columns:
    region_fig, region_ax = plt.subplots(figsize=(12, 7))
    sns.countplot(
        data=election_df,
        x='Region',
        hue='Candidate',
        palette='Set1',
        ax=region_ax,
    )
    region_ax.set_title('Candidate Support by Region')
    region_ax.set_xlabel('Region')
    region_ax.set_ylabel('Count')
    plt.show()
else:
    print("Columns 'Region' and/or 'Candidate' not found for regional analysis.")

# Trend Analysis (if 'Date' and 'Candidate_Preference' columns exist)
# Draws 'Candidate_Preference' against 'Date'; prints a message when either
# required column is missing.
if 'Date' in election_df.columns and 'Candidate_Preference' in election_df.columns:
    # Convert the column to datetime values before plotting.
    election_df['Date'] = pd.to_datetime(election_df['Date'])
    # 'Candidate' is not part of the guard above, so only split the lines by
    # candidate when that column exists. Passing a missing column name as
    # `hue` would make the plot call fail instead of drawing the trend.
    trend_hue = 'Candidate' if 'Candidate' in election_df.columns else None
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=election_df, x='Date', y='Candidate_Preference', hue=trend_hue)
    plt.title('Trends in Candidate Preference Over Time')
    plt.xlabel('Date')
    plt.ylabel('Candidate Preference (%)')
    plt.show()
else:
    print("Columns 'Date' and/or 'Candidate_Preference' not found for trend analysis.")

# Overall Candidate Preference
# Bar chart of the mean 'Candidate_Preference' for each 'Candidate', drawn
# without error bars. Prints a message when either column is missing.
overall_required = {'Candidate', 'Candidate_Preference'}
if not overall_required.issubset(election_df.columns):
    print("Columns 'Candidate' and/or 'Candidate_Preference' not found for overall candidate preference analysis.")
else:
    overall_fig, overall_ax = plt.subplots(figsize=(8, 5))
    # NOTE(review): `ci=` may be deprecated in favour of `errorbar=` in newer
    # seaborn releases -- confirm against the installed version before changing.
    sns.barplot(
        data=election_df,
        x='Candidate',
        y='Candidate_Preference',
        estimator=np.mean,
        ci=None,
        palette='plasma',
        ax=overall_ax,
    )
    overall_ax.set_title('Overall Candidate Preference')
    overall_ax.set_ylabel('Average Preference (%)')
    plt.show()


Column Names: Index(['ID', 'SEX', 'AGE', 'FO', 'TIP', 'Q1', 'Q2', 'Q6', 'TV', 'INETR', 'EDU',
       'DOHOD_0', 'PROF', 'PROF2', 'PROF3', 'weight1'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       1600 non-null   object 
 1   SEX      1600 non-null   object 
 2   AGE      1600 non-null   float64
 3   FO       1600 non-null   object 
 4   TIP      1600 non-null   object 
 5   Q1       1600 non-null   object 
 6   Q2       1600 non-null   object 
 7   Q6       1600 non-null   object 
 8   TV       1600 non-null   object 
 9   INETR    1600 non-null   object 
 10  EDU      1600 non-null   object 
 11  DOHOD_0  1600 non-null   object 
 12  PROF     1600 non-null   object 
 13  PROF2    692 non-null    object 
 14  PROF3    879 non-null    object 
 15  weight1  1600 non-null   float64
dtypes: float64(2), ob