In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style so every later plot in the notebook shares it.
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [None]:
# Read the Instagram usage CSV into `df` and preview the first rows.
csv_name = 'instagram_usage_lifestyle.csv'
# Decode as cp1252 with the python parser; rows that cannot be parsed are skipped, not raised.
read_options = dict(encoding='cp1252', engine='python', on_bad_lines='skip')

try:
    df = pd.read_csv(csv_name, **read_options)
except FileNotFoundError:
    # `df` is not (re)assigned on this path; later cells check for it before using it.
    print('Error: The file "instagram_usage_lifestyle.csv" was not found. Please ensure it is uploaded to the /content/ directory.')
else:
    print('Dataset loaded successfully.')
    display(df.head())

In [1]:
# Per-column count of missing entries (only when the dataset has been loaded).
if 'df' not in globals():
    print('DataFrame "df" is not defined. Please run the data loading cell.')
else:
    print('Missing values in each column:')
    missing_per_column = df.isna().sum()
    display(missing_per_column)

DataFrame "df" is not defined. Please run the data loading cell.


In [2]:
# Check duplicate rows
if 'df' in globals():
    # duplicated() flags every row that repeats an earlier one; summing the flags counts them.
    duplicates = df.duplicated().sum()
    print(f'Number of duplicate rows: {duplicates}')
else:
    # Same message as the missing-values cell, so an unloaded dataset is reported
    # instead of the cell silently producing no output.
    print('DataFrame "df" is not defined. Please run the data loading cell.')

In [3]:
# Dataset shape
if 'df' in globals():
    # df.shape is a (rows, columns) tuple; unpack it once for the two detail lines.
    n_rows, n_cols = df.shape
    print(f'Shape (rows, columns): {df.shape}')
    print(f'Number of rows: {n_rows}')
    print(f'Number of columns: {n_cols}')
else:
    # Same message as the missing-values cell, so an unloaded dataset is reported
    # instead of the cell silently producing no output.
    print('DataFrame "df" is not defined. Please run the data loading cell.')

In [4]:
# Data types
if 'df' in globals():
    # Show the dtype pandas assigned to each column.
    display(df.dtypes)
else:
    # Same message as the missing-values cell, so an unloaded dataset is reported
    # instead of the cell silently producing no output.
    print('DataFrame "df" is not defined. Please run the data loading cell.')

In [5]:
# Summary statistics
if 'df' in globals():
    # include='all' summarises every column, not only the numeric ones.
    display(df.describe(include='all'))
else:
    # Same message as the missing-values cell, so an unloaded dataset is reported
    # instead of the cell silently producing no output.
    print('DataFrame "df" is not defined. Please run the data loading cell.')

In [6]:
# Histogram of the 'age' column (20 bins, KDE overlay), ignoring missing ages.
age_available = 'df' in globals() and 'age' in df.columns
if not age_available:
    print('Column "age" not found or DataFrame not loaded.')
else:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df['age'].dropna(), bins=20, kde=True, ax=ax)
    ax.set_title('Distribution of Age')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    plt.show()

Column "age" not found or DataFrame not loaded.


In [7]:
# Mean engagement score per gender, highest first: bar chart followed by the table.
gender_required = {'gender', 'user_engagement_score'}
if 'df' in globals() and gender_required.issubset(df.columns):
    gender_engagement = (
        df.groupby('gender')['user_engagement_score']
        .mean()
        .sort_values(ascending=False)
    )
    fig, ax = plt.subplots(figsize=(8, 5))
    gender_engagement.plot.bar(ax=ax)
    ax.set_title("Average Engagement Score by Gender")
    ax.set_ylabel("Engagement Score")
    plt.show()
    display(gender_engagement)
else:
    print('Required columns for Gender Engagement not found or DataFrame not loaded.')

Required columns for Gender Engagement not found or DataFrame not loaded.


In [8]:
# Rank countries by mean engagement score and show the ten highest (chart + table).
country_required = {'country', 'user_engagement_score'}
if 'df' in globals() and country_required.issubset(df.columns):
    country_engagement = (
        df.groupby('country')['user_engagement_score']
        .mean()
        .sort_values(ascending=False)
    )
    # Slice once so the chart and the table are guaranteed to show the same rows.
    top_countries = country_engagement.head(10)
    fig, ax = plt.subplots(figsize=(12, 5))
    top_countries.plot.bar(ax=ax)
    ax.set_title("Top 10 Countries by Engagement Score")
    ax.set_ylabel("Engagement Score")
    plt.show()
    display(top_countries)
else:
    print('Required columns for Country Engagement not found or DataFrame not loaded.')

Required columns for Country Engagement not found or DataFrame not loaded.


In [9]:
# Scatter of follower count against engagement score, one point per row of df.
scatter_required = {'followers_count', 'user_engagement_score'}
if not ('df' in globals() and scatter_required.issubset(df.columns)):
    print('Required columns for Correlation Scatterplot not found or DataFrame not loaded.')
else:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=df, x='followers_count', y='user_engagement_score', ax=ax)
    ax.set(
        title="Followers Count vs Engagement Score",
        xlabel="Followers Count",
        ylabel="Engagement Score",
    )
    plt.show()

Required columns for Correlation Scatterplot not found or DataFrame not loaded.


In [10]:
# Correlation Matrix
# Heatmap of pairwise correlations between the numeric columns of df.
if 'df' in globals():
    numeric_cols = df.select_dtypes(include=[np.number])
    if numeric_cols.empty:
        print('No numeric columns available for correlation.')
    else:
        # Create the figure only once there is something to draw. Opening it before
        # the check left an empty figure behind on the "no numeric columns" path,
        # which the inline backend then renders as a blank "0 Axes" figure.
        plt.figure(figsize=(10,8))
        sns.heatmap(numeric_cols.corr(), cmap='coolwarm', annot=True, fmt='.2f')
        plt.title("Correlation Matrix of Numeric Features")
        plt.show()
else:
    print('DataFrame not loaded.')

DataFrame not loaded.


In [11]:
# Monthly Login Activity
def build_login_frame(frame, date_col='last_login_date'):
    """Return the rows of `frame` that have a parseable login date, plus a 'Month' column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data; it is not modified.
    date_col : str
        Name of the column holding the login date (default 'last_login_date').

    Returns
    -------
    pandas.DataFrame
        A new frame in which `date_col` is parsed with ``pd.to_datetime``
        (unparseable values become NaT and their rows are dropped) and
        'Month' holds the monthly period of each remaining date.
    """
    parsed = pd.to_datetime(frame[date_col], errors='coerce')
    # .assign builds new frames at every step, so the 'Month' column is never
    # written into a slice of another frame (the source of SettingWithCopyWarning).
    return (
        frame.assign(**{date_col: parsed})
        .dropna(subset=[date_col])
        .assign(Month=lambda rows: rows[date_col].dt.to_period('M'))
    )


if 'df' in globals() and 'last_login_date' in df.columns:
    # Keep the parsed dates on df, as this cell always has; parsing values that are
    # already datetimes leaves them unchanged, so re-running the cell is safe.
    df['last_login_date'] = pd.to_datetime(df['last_login_date'], errors='coerce')
    df_login = build_login_frame(df)
    # One row per login record, so the group size is the number of logins in that month.
    monthly_users = df_login.groupby('Month').size()

    plt.figure(figsize=(10,5))
    monthly_users.plot(marker='o')
    plt.title("Monthly Login Activity")
    plt.ylabel("Number of Logins")
    plt.grid(True)
    plt.show()
else:
    print('Column "last_login_date" not found or DataFrame not loaded.')

Column "last_login_date" not found or DataFrame not loaded.
