In [2]:
!pip install pymysql mysql-connector-python
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import os
import mysql.connector



In [3]:
# Connection settings for the MySQL server used by this notebook.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# values of the local test setup.
username = os.environ.get('SPHINX_DB_USER', 'test')
password = os.environ.get('SPHINX_DB_PASSWORD', '123456789')
host = os.environ.get('SPHINX_DB_HOST', 'localhost')
# Kept as a string because it is interpolated into the SQLAlchemy engine URL.
port = os.environ.get('SPHINX_DB_PORT', '3306')
database = os.environ.get('SPHINX_DB_NAME', 'sphinx')

In [4]:
# Empty every table of the target database so the CSV import starts from a
# clean state. Failures on individual tables are reported and skipped.
connection = mysql.connector.connect(
    host=host,
    port=int(port),  # use the configured port, same as the SQLAlchemy engine URL
    user=username,
    password=password,
    database=database
)
try:
    cursor = connection.cursor()
    try:
        # Get a list of all tables in the database
        cursor.execute("SHOW TABLES;")
        tables = cursor.fetchall()

        # Iterate over each table and delete all rows
        for (table_name,) in tables:
            try:
                # Backtick-quote the identifier so reserved words or unusual
                # characters in a table name cannot break the statement.
                quoted_name = "`" + table_name.replace("`", "``") + "`"
                truncate_query = f"TRUNCATE TABLE {quoted_name};"
                cursor.execute(truncate_query)
                print(f"Table '{table_name}' has been truncated.")
            except Exception as e:
                print(f"Failed to truncate table '{table_name}': {e}")

        # Commit the changes
        connection.commit()
    finally:
        # Release the cursor even if listing or truncating raised.
        cursor.close()
finally:
    # Always give the connection back, whatever happened above.
    connection.close()

Table 'badges' has been truncated.
Table 'comments' has been truncated.
Table 'linktypes' has been truncated.
Table 'postlinks' has been truncated.
Table 'posts' has been truncated.
Table 'posttypes' has been truncated.
Table 'users' has been truncated.
Table 'votes' has been truncated.
Table 'votetypes' has been truncated.


In [6]:
# Import every CSV found in ../data_1, ../data_2 and ../data_3 into MySQL.
# The table name is the lower-cased file stem followed by the folder number
# (e.g. Comments.csv in ../data_2 -> comments2).
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

size_threshold_mb = 15  # files above this size are streamed in chunks
chunksize = 10000       # rows per chunk when streaming

try:
    for i in [1, 2, 3]:
        folder_path = "../data_" + str(i)
        for file_name in os.listdir(folder_path):
            # Only CSV files are imported; everything else is ignored.
            if not file_name.endswith('.csv'):
                continue

            file_path = os.path.join(folder_path, file_name)
            table_name = (file_name.split(".")[0]).lower() + str(i)
            print(f"Reading file: {file_name}")
            file_size = os.path.getsize(file_path)  # Size in bytes

            if file_size / (1024 ** 2) > size_threshold_mb:
                try:
                    # engine.begin() commits when the block completes and rolls
                    # back automatically if it raises, so no manual rollback on
                    # an already-closed connection is needed.
                    # NOTE: MySQL commits DDL implicitly, so a table created by
                    # to_sql may remain (empty) after a rolled-back load.
                    with engine.begin() as connection:
                        print(f"Dataframe will be inserted in chunks: {file_name}")
                        for chunk in pd.read_csv(file_path, chunksize=chunksize, index_col=False):
                            # Drop index columns left behind by an earlier DataFrame.to_csv.
                            chunk = chunk.iloc[:, ~chunk.columns.str.contains('^Unnamed')]
                            chunk.to_sql(name=table_name, con=connection, if_exists='append', index=False)
                except Exception as e:
                    print(f"An error occurred: {e}")
            else:
                df = pd.read_csv(file_path, index_col=False)
                # Drop index columns left behind by an earlier DataFrame.to_csv.
                df = df.iloc[:, ~df.columns.str.contains('^Unnamed')]
                print(f"Dataframe ready for MySQL: {file_name}")

                try:
                    # Same transactional scope as the chunked path.
                    with engine.begin() as connection:
                        df.to_sql(table_name, con=connection, if_exists='replace', index=False)
                except Exception as e:
                    print(f"An error occurred: {e}")
finally:
    # Release pooled connections once, after all files have been handled.
    engine.dispose()
                

Reading file: Comments.csv
Dataframe will be inserted in chunks: Comments.csv
Reading file: Posts.csv
Dataframe will be inserted in chunks: Posts.csv
Reading file: Users.csv
Dataframe will be inserted in chunks: Users.csv
Reading file: Comments.csv
Dataframe will be inserted in chunks: Comments.csv
Reading file: Posts.csv
Dataframe will be inserted in chunks: Posts.csv
Reading file: Users.csv
Dataframe ready for MySQL: Users.csv
Reading file: Comments.csv
Dataframe will be inserted in chunks: Comments.csv
Reading file: Posts.csv
Dataframe will be inserted in chunks: Posts.csv
Reading file: Users.csv
Dataframe ready for MySQL: Users.csv


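Before running the next cell, install Sphinx (the Sphinx search engine, version 3.7.1) and place its extracted `sphinx-3.7.1` folder (the one containing `bin/`) in the same folder as this notebook, next to the `sphinx-min.conf.dist` configuration file.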

In [7]:
import subprocess

# Run the Sphinx indexer on every index declared in sphinx-min.conf.dist.
# The binary path is assembled with os.path.join so the separator matches the
# current platform (on Windows this yields the same path as before).
indexer_path = os.path.join("sphinx-3.7.1", "bin", "indexer")

result = subprocess.run(
    [indexer_path, "--config", "sphinx-min.conf.dist", "--all"],
    capture_output=True,
    text=True
)
print(result.stdout)
print(result.stderr)

# Make a failed run obvious instead of relying on the reader to spot it in stderr.
if result.returncode != 0:
    print(f"indexer exited with status {result.returncode}")


Sphinx 3.7.1-dev (commit da9f8a4e7)
Copyright (c) 2001-2024, Andrew Aksyonoff
Copyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)

using config file 'sphinx-min.conf.dist'...
indexing index 'comments1'...
collected 1373756 docs, 203.7 MB
sorted 37.2 Mhits, 100.0% done
total 1373756 docs, 203.7 Mb
total 24.8 sec, 8.198 Mb/sec, 55290 docs/sec
indexing index 'posts1'...
collected 1565425 docs, 1101.4 MB
sorted 183.3 Mhits, 100.0% done
total 1565425 docs, 1.101 Gb
total 121.1 sec, 9.098 Mb/sec, 12930 docs/sec
indexing index 'users1'...
collected 99869 docs, 9.2 MB
sorted 1.5 Mhits, 100.0% done
total 99869 docs, 9.171 Mb
total 1.5 sec, 6.199 Mb/sec, 67507 docs/sec
indexing index 'usersjoincomments1'...
collected 831529 docs, 215.5 MB
sorted 37.5 Mhits, 100.0% done
total 831529 docs, 215.5 Mb
total 35.4 sec, 6.088 Mb/sec, 23489 docs/sec
indexing index 'usersjoinposts1'...
collected 948342 docs, 770.1 MB
sorted 128.6 Mhits, 100.0% done
total 948342 docs, 770.1 Mb
total 