In [1]:
#Importing necessary libraries
import concurrent.futures
import logging
import os
import time

import mysql.connector
import pandas as pd
from mysql.connector import Error

### 1. Establishing a connection between Python and MySQL

In [2]:
#Connection settings for the MySQL server used by this notebook.
#Every value can be overridden with an environment variable so credentials do
#not have to live in the notebook; the literals are only backward-compatible
#fallbacks for the original local setup.
#NOTE(review): set SPORTS_DB_PASSWORD and remove the literal fallback before sharing this notebook.
db_configuration = {
    'host': os.environ.get('SPORTS_DB_HOST', 'localhost'),
    'database': os.environ.get('SPORTS_DB_NAME', 'sports_database'),
    'user': os.environ.get('SPORTS_DB_USER', 'root'),
    'password': os.environ.get('SPORTS_DB_PASSWORD', '123mysql.'),
    #Kept as a string, exactly like the original literal.
    'port': os.environ.get('SPORTS_DB_PORT', '3307')
}

#Function to connect to the MySQL database
def connect_to_database():
    """Open a MySQL connection using ``db_configuration``.

    Returns:
        The connection object when the server reports it as connected;
        ``None`` when the connection is not live or when connecting raised
        ``Error`` (the error is printed, not re-raised).
    """
    try:
        db_connection = mysql.connector.connect(**db_configuration)
        if not db_connection.is_connected():
            return None
        print("Connected to MySQL database")
        return db_connection
    except Error as err:
        print(f"Error connecting to MySQL database: {err}")
    return None

#Smoke-test the connection, then release it so this check does not leave an
#open session behind every time the cell is run.
test_connection = connect_to_database()
if test_connection is not None:
    test_connection.close()

Connected to MySQL database


<mysql.connector.connection_cext.CMySQLConnection at 0x1837f3dacd0>

### 2. Ingestion of data and logging

In [3]:
import concurrent.futures
import time

#Logging configuration: INFO and above are appended to data_ingestion.log.
logging.basicConfig(level=logging.INFO, filename='data_ingestion.log', filemode='a', format='%(asctime)s - %(levelname)s - %(message)s')

#Same settings as db_configuration; copied instead of re-typed so the
#credentials are declared in exactly one place.
db_config = dict(db_configuration)

conn = mysql.connector.connect(**db_config)
try:
    cursor = conn.cursor()

    #IF NOT EXISTS keeps this cell re-runnable (Restart & Run All) once the
    #table has been created.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS sports_dataset (
            Sl_no INTEGER PRIMARY KEY,
            Unnamed INTEGER,
            Player VARCHAR(255) NOT NULL,
            Team VARCHAR(255),
            Age INTEGER,
            Height REAL,
            Weight REAL,
            Position VARCHAR(255),
            Goals INTEGER,
            Assists INTEGER,
            YellowCards INTEGER,
            RedCards INTEGER,
            PassCompletionRate REAL,
            DistanceCovered REAL,
            Sprints INTEGER,
            ShotsOnTarget INTEGER,
            TacklesWon INTEGER,
            CleanSheets INTEGER,
            PlayerFatigue REAL,
            MatchPressure INTEGER,
            InjuryHistory INTEGER,
            TrainingHours REAL,
            FatigueInjuryCorrelation REAL,
            PressurePerformanceImpact REAL,
            EffectiveTraining REAL,
            Season INTEGER,
            EffectiveTrainingHours INTEGER,
            PerformanceRatio INTEGER,
            PredictedPerformanceCategory VARCHAR(255),
            FatiguePressureInteraction INTEGER,
            PredictedInteractionCategory VARCHAR(255)
        )
    ''')

    #Index on the Position column for faster querying on that column.
    #fetchall() drains the whole result set so no unread rows are left on the
    #connection before the next statement.
    cursor.execute("SHOW INDEX FROM sports_dataset WHERE Key_name = 'idx_position'")
    index_exists = cursor.fetchall()
    if not index_exists:
        #The original built idx_position on Team, contradicting both the index
        #name and the comment above. An idx_position created by an earlier run
        #is left as-is because the check is by index name only.
        cursor.execute('CREATE INDEX idx_position ON sports_dataset (Position)')

    #Committing the transaction
    conn.commit()
finally:
    #Always release the connection, even if one of the statements failed.
    conn.close()
print("Database and table setup complete.")

Database and table setup complete.


### 3. Incremental Loading

In [4]:
def load_new_data(csv_file, batch_size=1000):
    """Insert the rows of ``csv_file`` whose ``Sl_no`` is not yet in ``sports_dataset``.

    The CSV is read with pandas, rows whose ``Sl_no`` already exists in the
    table are dropped, and the remaining rows are inserted positionally, so the
    CSV column order must match the column list of the INSERT statement.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV to load.
        batch_size: Maximum number of rows sent per ``executemany`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error raised while reading the CSV or talking to MySQL
            propagates; the connection is closed first and nothing is committed.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    logging.info('Loading new data from CSV file.')

    insert_sql = '''
        INSERT INTO sports_dataset (
            Sl_no, Unnamed, Player, Team, Age, Height, Weight,
            Position, Goals, Assists, YellowCards, RedCards,
            PassCompletionRate, DistanceCovered, Sprints, ShotsOnTarget,
            TacklesWon, CleanSheets, PlayerFatigue, MatchPressure,
            InjuryHistory, TrainingHours, FatigueInjuryCorrelation,
            PressurePerformanceImpact, EffectiveTraining, Season,
            EffectiveTrainingHours, PerformanceRatio,
            PredictedPerformanceCategory, FatiguePressureInteraction,
            PredictedInteractionCategory)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''

    #Loading the new data from the CSV file
    new_data = pd.read_csv(csv_file)

    conn = mysql.connector.connect(**db_configuration)
    try:
        cursor = conn.cursor()

        #Get the Sl_no values already stored in the database
        cursor.execute("SELECT Sl_no FROM sports_dataset")
        existing_indexes = [row[0] for row in cursor.fetchall()]

        #Filtering out records that already exist in the database
        new_records = new_data[~new_data['Sl_no'].isin(existing_indexes)]
        logging.info(f'Found {len(new_records)} new records to insert.')

        #astype(object) yields plain Python scalars; missing cells (NaN) are
        #mapped to None so they are stored as SQL NULL instead of being sent
        #to the server as NaN.
        rows = [
            tuple(None if pd.isna(value) else value for value in record)
            for record in new_records.astype(object).itertuples(index=False, name=None)
        ]

        #Insert in bounded batches rather than one statement per row.
        for start in range(0, len(rows), batch_size):
            cursor.executemany(insert_sql, rows[start:start + batch_size])

        conn.commit()
    finally:
        #Release the connection on success and on failure alike; closing
        #without a commit discards a partially inserted load.
        conn.close()

    logging.info(f'{len(new_records)} new records inserted.')

#Run the incremental load for the cleaned dataset; the file name is a relative
#path, so it is resolved against the notebook's current working directory.
load_new_data('cleaned_sports_dataset.csv')