In [1]:
import pandas as pd

In [2]:
def read_csv(file_path):
    """Load CSV data into a pandas DataFrame.

    Args:
        file_path: Location of the CSV data; handed unchanged to
            ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(file_path)

def read_json(file_path):
    """Load JSON data into a pandas DataFrame.

    Args:
        file_path: Location of the JSON data; handed unchanged to
            ``pandas.read_json``.

    Returns:
        The DataFrame produced by ``pandas.read_json`` with default options.
    """
    return pd.read_json(file_path)

def read_excel(file_path):
    """Load an Excel workbook into a pandas DataFrame.

    Args:
        file_path: Location of the workbook; handed unchanged to
            ``pandas.read_excel``.

    Returns:
        The DataFrame produced by ``pandas.read_excel`` with default options.
    """
    return pd.read_excel(file_path)

In [3]:
import os

def read_data(file_path):
    """Load a data file, choosing the reader from the file extension.

    The extension is compared case-insensitively. ``.csv`` goes to
    ``read_csv``, ``.json`` to ``read_json`` and ``.xls`` / ``.xlsx`` to
    ``read_excel``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected reader returns for ``file_path``.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix in ('.xls', '.xlsx'):
        return read_excel(file_path)
    if suffix == '.json':
        return read_json(file_path)
    if suffix == '.csv':
        return read_csv(file_path)

    raise ValueError(f"Unsupported file format: {suffix}")

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(data):
    """Clean, encode and standardise the raw table.

    Steps, in order:
      1. replace every missing value with 0,
      2. cast the ``Rank`` column to ``int``,
      3. one-hot encode ``Country`` and ``Country Code`` (first level dropped),
      4. standardise ``Rank``, ``Gold``, ``Silver``, ``Bronze`` and ``Total``
         with a freshly fitted ``StandardScaler``.

    The resulting shape and first rows are printed as a debugging aid.

    Args:
        data: DataFrame that contains all of the columns named above.

    Returns:
        The processed DataFrame.
    """
    categorical_columns = ['Country', 'Country Code']
    numeric_columns = ['Rank', 'Gold', 'Silver', 'Bronze', 'Total']

    # fillna returns a new frame, so the column assignments below do not
    # touch the caller's object.
    data = data.fillna(0)
    data['Rank'] = data['Rank'].astype(int)

    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    data[numeric_columns] = StandardScaler().fit_transform(data[numeric_columns])

    print("Processed data shape:", data.shape)  # Debug: Check shape
    print(data.head())  # Debug: Check sample data

    return data


In [5]:
def descriptive_statistics(data):
    """Return the summary table produced by ``data.describe()``.

    Args:
        data: DataFrame to summarise.

    Returns:
        The result of calling ``describe`` on ``data`` with default options.
    """
    summary = data.describe()
    return summary

In [6]:

def correlation_analysis(data):
    """Compute the pairwise correlation matrix of the numeric columns.

    Columns are selected with ``include=['number']`` so that every numeric
    dtype takes part (int8/int16/unsigned ints/float16 as well as the default
    int64/float64). This is the same selection ``plot_correlation_matrix``
    uses, so the printed matrix and the plotted heatmap describe the same
    columns.

    Args:
        data: DataFrame to analyse.

    Returns:
        DataFrame holding the correlation between each pair of numeric
        columns, as returned by ``DataFrame.corr`` with default options.
    """
    numeric_data = data.select_dtypes(include=['number'])
    return numeric_data.corr()

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def linear_regression_analysis(data, target_column):
    """Fit a linear regression of ``target_column`` on all other columns.

    Every column is coerced to numeric (unparseable values become NaN) and
    only rows where *all* features and the target are present are kept. The
    remaining rows are split 80/20 (``random_state=42``), a
    ``LinearRegression`` is fitted on the training part and evaluated on the
    held-out part.

    Args:
        data: DataFrame containing the target and the feature columns.
        target_column: Name of the column to predict.

    Returns:
        ``(model, mse)`` - the fitted ``LinearRegression`` and its mean
        squared error on the test split - or ``(None, None)`` (after printing
        an error message) when there is not enough usable data.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Ensure X and y are not empty
    if X.empty or y.empty:
        print("Error: Data for training is empty.")
        return None, None

    X = X.apply(pd.to_numeric, errors='coerce')
    y = pd.to_numeric(y, errors='coerce')

    # Filter features and target together: a row is usable only if none of
    # its features AND its target turned into NaN. Filtering X alone would
    # let NaN targets through to ``fit``.
    usable_rows = X.notna().all(axis=1) & y.notna()
    X = X[usable_rows]
    y = y[usable_rows]

    # Ensure that data is not empty after conversion
    if X.empty or y.empty:
        print("Error: Data for training is empty after conversion.")
        return None, None

    # With test_size=0.2 a single row would leave the training set empty and
    # train_test_split would raise, so reject that case up front.
    if len(X) < 2:
        print("Error: One of the training or test sets is empty.")
        return None, None

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return model, mse



In [8]:
from sklearn.cluster import KMeans


def kmeans_clustering(data, n_clusters):
    """Cluster the rows of ``data`` with K-Means.

    Only the columns selected by ``select_dtypes(include=[float, int])`` are
    used as features. The input frame is left untouched; labels are written
    to a ``Cluster`` column on a copy.

    Args:
        data: DataFrame to cluster.
        n_clusters: Number of clusters to form.

    Returns:
        ``(kmeans, clustered_data)`` - the fitted ``KMeans`` estimator
        (``random_state=42``) and a copy of ``data`` with the ``Cluster``
        column added.
    """
    from sklearn.cluster import KMeans

    features = data.select_dtypes(include=[float, int])
    estimator = KMeans(n_clusters=n_clusters, random_state=42)
    labels = estimator.fit_predict(features)

    labelled = data.copy()
    labelled['Cluster'] = labels
    return estimator, labelled


In [9]:
from sklearn.decomposition import PCA
import pandas as pd

def perform_pca(data):
    """Project the numeric columns of ``data`` onto two principal components.

    Only the columns selected by ``select_dtypes(include=[float, int])`` are
    fed to the PCA.

    Args:
        data: DataFrame to decompose.

    Returns:
        ``(pca_results, explained_variance)`` - a DataFrame with columns
        ``PC1`` and ``PC2`` holding the projected rows, and the fitted PCA's
        ``explained_variance_ratio_``.
    """
    from sklearn.decomposition import PCA

    features = data.select_dtypes(include=[float, int])
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(features)

    components = pd.DataFrame(projected, columns=['PC1', 'PC2'])
    return components, reducer.explained_variance_ratio_

In [10]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

def plot_descriptive_statistics(data):
    """Save a histogram/boxplot pair for every numeric column.

    For each column selected by ``select_dtypes(include=['number'])`` a
    12x6 figure is drawn with a KDE-overlaid histogram on the left and a
    boxplot on the right, written to ``<column>_plots.png`` in the current
    working directory and then closed.

    Args:
        data: DataFrame whose numeric columns are plotted.
    """
    for column in data.select_dtypes(include=['number']).columns:
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

        sns.histplot(data[column], kde=True, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {column}')

        sns.boxplot(x=data[column], ax=box_ax)
        box_ax.set_title(f'Boxplot of {column}')

        fig.tight_layout()
        fig.savefig(f'{column}_plots.png')
        # Close each figure so a wide table does not accumulate open figures.
        plt.close(fig)

def plot_correlation_matrix(data):
    """Save an annotated heatmap of the numeric columns' correlations.

    The heatmap is written to ``correlation_matrix.png`` in the current
    working directory. When ``data`` has no numeric content a message is
    printed and nothing is drawn.

    Args:
        data: DataFrame to analyse.
    """
    numeric_only = data.select_dtypes(include=['number'])

    if numeric_only.empty:
        print("No numeric columns available for correlation analysis.")
        return

    correlations = numeric_only.corr()

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Correlation Matrix')
    fig.savefig('correlation_matrix.png')
    plt.close(fig)


def plot_clusters(clustered_data):
    """Save a Gold-vs-Silver scatter plot coloured by cluster label.

    The figure is written to ``kmeans_clusters.png`` in the current working
    directory.

    Args:
        clustered_data: DataFrame with ``Gold``, ``Silver`` and ``Cluster``
            columns.
    """
    gold = clustered_data['Gold']
    silver = clustered_data['Silver']
    labels = clustered_data['Cluster']

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(x=gold, y=silver, hue=labels, palette='viridis', ax=ax)
    ax.set_title('K-Means Clustering Results')
    ax.set_xlabel('Gold Medals')
    ax.set_ylabel('Silver Medals')
    fig.savefig('kmeans_clusters.png')
    plt.close(fig)

def plot_pca(pca_results, filename):
    """Save a scatter plot of the first two principal components.

    Args:
        pca_results: DataFrame with ``PC1`` and ``PC2`` columns.
        filename: Path the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=pca_results['PC1'], y=pca_results['PC2'], ax=ax)
    ax.set_title('PCA Results')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)



In [11]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch

def generate_pdf_report(filename, plots):
    """Assemble the saved plot images into a letter-sized PDF.

    The first page carries only the report title. Each entry of ``plots``
    then gets its own page: the entry's key as a heading and its image drawn
    6in x 4in below it.

    Args:
        filename: Path of the PDF to write.
        plots: Mapping of page heading -> image file path; pages follow the
            mapping's iteration order.
    """
    _, page_height = letter
    left_margin = 1 * inch
    heading_y = page_height - 1 * inch

    pdf = canvas.Canvas(filename, pagesize=letter)

    # Cover page: report title only.
    pdf.setFont("Helvetica-Bold", 16)
    pdf.drawString(left_margin, heading_y, "Data Analysis Report")

    for title, plot_filename in plots.items():
        # showPage finishes the current page, so calling it first leaves the
        # cover page on its own and starts a fresh page for this plot.
        pdf.showPage()

        pdf.setFont("Helvetica-Bold", 12)
        pdf.drawString(left_margin, heading_y, title)

        pdf.drawImage(
            plot_filename,
            left_margin,
            page_height - 5 * inch,
            width=6 * inch,
            height=4 * inch,
        )

    pdf.save()


In [12]:
def analysis_pipeline(file_path, target_column):
    """Run the whole analysis on one data file and write a PDF report.

    The stages run in this order: load (``read_data``), preprocess
    (``preprocess_data``), descriptive statistics, correlation analysis,
    linear regression on ``target_column``, K-Means with 3 clusters, and a
    PCA. Intermediate results are printed, the plotting helpers save their
    figures to the current working directory, and the figures are finally
    collected into ``data_analysis_report.pdf``.

    Args:
        file_path: Path of the input data file.
        target_column: Column the linear regression should predict. When it
            is empty/None a message is printed and nothing else is done.

    Returns:
        None.
    """
    # Ensure target_column is set for ML models
    if not target_column:
        print("Target column is not specified for ML models.")
        return

    data = read_data(file_path)
    data = preprocess_data(data)

    # Descriptive statistics
    print("Descriptive Statistics:")
    print(descriptive_statistics(data))
    plot_descriptive_statistics(data)

    # Correlation analysis
    print("\nCorrelation Analysis:")
    print(correlation_analysis(data))
    plot_correlation_matrix(data)

    # Linear regression
    print("\nLinear Regression Analysis:")
    _, mse = linear_regression_analysis(data, target_column)
    print(f"Mean Squared Error: {mse}")

    # K-Means clustering
    print("\nK-Means Clustering:")
    _, clustered_data = kmeans_clustering(data, n_clusters=3)  # Example: 3 clusters
    print(clustered_data.head())
    plot_clusters(clustered_data)

    # Principal component analysis: run once and report what perform_pca
    # actually returns (the explained-variance ratio, not an error metric).
    print("\nPrincipal Component Analysis:")
    pca_results, explained_variance = perform_pca(data)
    print(f"Explained Variance Ratio: {explained_variance}")
    plot_pca(pca_results, 'pca_plot.png')

    # Heading -> image file for each report page.
    plots = {
        'Descriptive Statistics Plots': 'Total_plots.png',
        'Correlation Matrix': 'correlation_matrix.png',
        'K-Means Clustering Results': 'kmeans_clusters.png',
        'PCA Results': 'pca_plot.png'
    }

    generate_pdf_report('data_analysis_report.pdf', plots)

