# 数据清洗，上传到MySQL

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

from config import MySQLConfig

# Build the SQLAlchemy engine from the project's MySQL settings.
db = MySQLConfig()

# URL.create() escapes every component itself, so credentials that contain
# characters such as "@", ":" or "/" no longer corrupt the URL the way they
# did when it was assembled with an f-string.
engine_url = URL.create(
    drivername="mysql+pymysql",
    username=db.user,
    password=db.password,
    host=db.host,
    port=db.port,
    database=db.database,
    query={"charset": db.charset},
)
# Kept as a plain string in case another cell still reads `engine_str`.
engine_str = engine_url.render_as_string(hide_password=False)
engine = create_engine(engine_url)

# create_engine() is lazy and does not contact the server, so open (and
# immediately release) one connection before reporting success.
with engine.connect():
    pass
print("MySQL连接成功")

## customer.csv

In [None]:
# Load the raw customer export and take a first look at it.
file_path = "./data/customer.csv"
df = pd.read_csv(file_path)

print("数据概览：")
display(df.head())
print("\n数据类型信息：")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df.info()

### 类型转换

In [None]:
# Convert the date columns; values that cannot be parsed are coerced to NaT.
date_cols = ["近期合作月份_time", "近期合作日期_time"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Convert the numeric columns; values that cannot be parsed are coerced to NaN.
num_cols = ["运单数", "业务量", "体积", "计费重量", "收益"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Show how many values are missing per column after coercion.
print("缺失值统计：")
display(df.isna().sum())
print("数据概览：")
display(df.head())
print("\n数据类型信息：")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df.info()


### 处理缺失值

### 上传数据库

In [None]:
# Write the cleaned customer data to MySQL, replacing the table if it exists.
table_name = "customer"
df.to_sql(
    name=table_name,
    con=engine,
    if_exists="replace",
    index=False,
    chunksize=1000,
)

# Report the table that was actually written; the message previously named
# "customer_info", which is not the table created above.
print(f"已成功上传 {len(df)} 条数据至表 {table_name}")


In [None]:
# Sanity check: count the rows now stored in the customer table.
# The table name is left unqualified so the count comes from the database the
# engine is connected to, rather than from a hard-coded schema name.
sql = "SELECT COUNT(*) FROM customer"
with engine.connect() as conn:
    result = pd.read_sql(sql, conn)
display(result)

## order.csv

In [None]:
# Load the raw order export and take a first look at it.
file_path = "./data/order.csv"
df = pd.read_csv(file_path)

print("数据概览：")
display(df.head())
print("\n数据类型信息：")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df.info()


In [None]:

# Convert the date columns; values that cannot be parsed are coerced to NaT.
date_cols = ["收入月份_time", "录入时间_time", "录入日期_time", "签字日期_time"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Convert the numeric columns; values that cannot be parsed are coerced to NaN.
num_cols = ["件数", "毛重", "体积", "计费重量", "收入金额", "lat", "lng"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Show how many values are missing per column after coercion.
print("缺失值统计：")
display(df.isna().sum())

print("清洗后数据概览：")
display(df.head())
print("\n数据类型信息：")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df.info()




In [None]:
# Upload the cleaned order data, then read back the row count as a check.
# The table name is defined once and reused in the message and the query.
table_name = "order"

# engine.begin() commits when the block succeeds and rolls back if it raises.
try:
    with engine.begin() as conn:
        df.to_sql(
            name=table_name,
            con=conn,
            if_exists="replace",  # use "replace" for the first upload, switch to "append" afterwards
            index=False,
            chunksize=1000,
        )
except Exception as e:
    # Best-effort notebook cell: report the failure instead of stopping the run.
    print("上传失败：", e)
else:
    print(f"已成功上传 {len(df)} 条数据至表 `{table_name}`")

try:
    # `order` is a reserved word in MySQL, hence the backticks around the name.
    sql = f"SELECT COUNT(*) AS total_rows FROM `{table_name}`"
    with engine.connect() as conn:
        result = pd.read_sql(sql, conn)
    print("数据库中记录数：")
    display(result)
except Exception as e:
    print("查询失败：", e)