In [66]:
import pandas as pd
import numpy as np
import atexit
from db_service_sqlalchemy import create_connection, close_connection
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, Boolean, DateTime, update,text
from sqlalchemy.exc import SQLAlchemyError
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from lsoa_sa_boundaries import *


In [87]:
def update_player(df,conn,table_destination):
    '''
    Parameters
    ----------
    df : dataframe to upsert into database-> table player

    Returns
    -------
    None.

    '''

    df.to_sql('table_travail',conn,if_exists='replace',index=False)
                              
    conn.execute(f"""
                 INSERT INTO {table_destination}
                 SELECT *
                 FROM table_travail
                 ON CONFLICT ON CONSTRAINT PRIMARY 
                 DO UPDATE
                 SET LSOAcode=EXCLUDED.LSOAcode,
                 LSOAname=EXCLUDED.LSOAname,
                 """)
    conn.commit()
    

In [88]:
def connect_maria(db_name:str, user="root", pwd="", host="localhost", port="3306"):
    """
    Connecte une base de données MariaDB via SQLalchemy
    """
    return create_engine(
        f"mariadb+pymysql://{user}:{pwd}@{host}:{port}/{db_name}")


In [89]:
print(db_name)

crime_short


In [90]:


def update_table_with_df(table_name, table_fields, df, df_columns, engine, session, table_key_field, df_key_column):
    """
    Met à jour une table de la base de données avec les données d'un DataFrame.

    :param table_name: Nom de la table à mettre à jour
    :param table_fields: Liste des champs de la table à mettre à jour
    :param df: DataFrame contenant les données
    :param df_columns: Liste des colonnes du DataFrame correspondant aux champs de la table
    :param engine: SQLAlchemy engine
    :param session: SQLAlchemy session
    :param table_key_field: Champ de la table utilisé pour identifier les lignes
    :param df_key_column: Colonne du DataFrame utilisée pour identifier les lignes
    """
    metadata = MetaData()
    table = Table(table_name, metadata, autoload_with=engine)
    compteur=0
    for index, row in df.iterrows():
        stmt = (
            update(table).
            where(table.c[table_key_field] == row[df_key_column]).
            values({table_fields[i]: row[df_columns[i]] for i in range(len(table_fields))})
        )
        session.execute(stmt)
        compteur+=1
        if compteur==1000:
            print("1000 mise à jour")
            compteur=0

    session.commit()

def update_and_add_columns(table_name, table_fields, df, df_columns, engine, session, table_key_field, df_key_column, new_columns):
    """
    Met à jour une table de la base de données avec les données d'un DataFrame et ajoute de nouvelles colonnes.

    :param table_name: Nom de la table à mettre à jour
    :param table_fields: Liste des champs de la table à mettre à jour
    :param df: DataFrame contenant les données
    :param df_columns: Liste des colonnes du DataFrame correspondant aux champs de la table
    :param engine: SQLAlchemy engine
    :param session: SQLAlchemy session
    :param table_key_field: Champ de la table utilisé pour identifier les lignes
    :param df_key_column: Colonne du DataFrame utilisée pour identifier les lignes
    :param new_columns: Dictionnaire des nouvelles colonnes à ajouter (nom_colonne: type_colonne)
    """
    metadata = MetaData(bind=engine)
    table = Table(table_name, metadata, autoload_with=engine)

    # Ajouter les nouvelles colonnes à la table
    for col_name, col_type in new_columns.items():
        if col_name not in table.columns:
            col = Column(col_name, col_type)
            col.create(table, populate_default=True)

    for index, row in df.iterrows():
        stmt = (
            update(table).
            where(table.c[table_key_field] == row[df_key_column]).
            values({table_fields[i]: row[df_columns[i]] for i in range(len(table_fields))})
        )
        session.execute(stmt)

    session.commit()


# Connexion à la base de données
db_name="crime_short"
conn, engine = create_connection("127.0.0.1", "root", "", db_name)

# Enregistrer la fonction de nettoyage pour qu'elle soit appelée à la sortie
atexit.register(lambda:close_connection(conn,engine))


Connexion réussie à la base de données MariaDB


<function __main__.<lambda>()>

In [91]:

print("Connexion à la base de données réussie")


# Exemple de requête SQL
query = "SELECT * FROM outcomes_temp"
df1 = pd.read_sql(query, conn)

query = "SELECT * FROM stopandsearch_temp"
df2 = pd.read_sql(query, conn)
df2['LSOAcode'] = None
df2['LSOAname'] = None


query= "SELECT * FROM street_temp"
df3 = pd.read_sql(query, conn)


print('vérification des données avant intégration')
print(len(df1))
print(len(df2))
print(len(df3))
lsoa_file='LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Best_Fit_Lookup_for_EW_(V2).csv'
sa_file='Look-up Tables_0.xlsx'



Connexion à la base de données réussie
vérification des données avant intégration
405433
87407
937274


In [92]:
df1.columns

Index(['CrimeID', 'Month', 'Reportedby', 'Longitude', 'Latitude', 'Location',
       'LSOAcode', 'LSOAname', 'Outcometype', 'anneemois', 'infogeo', 'id'],
      dtype='object')

In [93]:



lsoa_data=pd.read_csv(lsoa_file)
lsoa_data=lsoa_data[['LSOA11CD','LSOA11NM']]

sa_data=pd.read_excel(sa_file,sheet_name=2, engine='openpyxl')
sa_data=sa_data[['SA2011','SA2011NAME']]
sa_data['SA2011NAME']=sa_data['SA2011NAME'].str.extract(r'\((.*?)\)')


In [94]:
# =============================================================================
# step 1 take row with latitude and no lsoa and fill lsoa if possible in df1
# =============================================================================
mask = (df1['LSOAcode'].isnull()) & (df1['Latitude'].notnull())
# Use the mask with .loc to keep the original indices
missing_lsoa_df1 = df1.loc[mask]
# missing_lsoa_df1 = df1[df1['LSOA code'].isnull()] # si code = null, name aussi
# missing_lsoa_df1 = missing_lsoa_df1[missing_lsoa_df1['Latitude'].notnull()] # si longitude = ok, latitude aussi
valid_lsoa_df1 = df1[df1['LSOAcode'].notnull()] # si code = null, name aussi
valid_lsoa_df1 = valid_lsoa_df1[valid_lsoa_df1['Latitude'].notnull()] # si longitude = ok, latitude aussi
valid_lsoa_df1 = valid_lsoa_df1.drop_duplicates(subset=['Latitude', 'Longitude', 'LSOAcode'])


# Prepare coordinates for KNN
valid_coords = valid_lsoa_df1[['Latitude', 'Longitude']].values
missing_coords = missing_lsoa_df1[['Latitude', 'Longitude']].values


knn = NearestNeighbors(n_neighbors=5, metric='euclidean')  # Euclidean distance for geographical proximity
knn.fit(valid_coords)
threshold=0.02

distances, indices = knn.kneighbors(missing_coords)

# Step 4: Extract the LSOAcodes for the nearest neighbors
nearest_lsoa_codes = valid_lsoa_df1['LSOAcode'].values[indices]


def most_common_lsoa(codes):
    # Use Counter to count occurrences of each LSOAcode
    if len(codes) > 0:
        counts = Counter(codes)
        # Get the most common LSOAcode
        most_common = counts.most_common()
        # Check for ties
        highest_count = most_common[0][1]  # Get the highest frequency
        candidates = [code for code, count in most_common if count == highest_count]
        # Return the first one (the left-most in case of a draw)
        return candidates[0]
    return None  # In case there are no codes

# Apply the function to find the most common LSOAcode for each row
# =============================================================================
# missing_lsoa_df1['LSOAcode'] = [most_common_lsoa(row) for row in nearest_lsoa_codes]
# =============================================================================
missing_lsoa_df1['LSOAcode'] = [
    most_common_lsoa(valid_lsoa_df1['LSOAcode'].values[indices[i][distances[i] < threshold]]) 
    for i in range(len(missing_coords))
]

missing_lsoa_df1['original_index'] = missing_lsoa_df1.index
missing_lsoa_df1 = missing_lsoa_df1.merge(lsoa_data, left_on='LSOAcode', right_on='LSOA11CD', how='left')
missing_lsoa_df1['LSOAname'] = missing_lsoa_df1['LSOAname'].fillna(missing_lsoa_df1['LSOA11NM'])
missing_lsoa_df1 = missing_lsoa_df1.drop(['LSOA11CD', 'LSOA11NM'], axis=1)
missing_lsoa_df1 = missing_lsoa_df1[missing_lsoa_df1['LSOAcode']!='']
missing_lsoa_df1.set_index('original_index', inplace=True)

df1.update(missing_lsoa_df1)
print(df1.head())




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df1['LSOAcode'] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df1['original_index'] = missing_lsoa_df1.index


                                             CrimeID    Month  \
0  4b6598fccb7993fdb6bdaaadbe13a976f8ab18abe151fa...  2019-11   
1  0214adc5f554debfd007146378315c7c3817dfbcf4bdb0...  2019-11   
2  4bfa0dc6589a256c8519e521f2c2d61b48071a345e5823...  2019-11   
3  ee9f4c4dafc480cd35e7dfe37f61e5cde90dd683748dd4...  2019-11   
4  c4322ca5564e31c2498d02f4d0b0169bba8bb2fc7247bf...  2019-11   

                       Reportedby  Longitude  Latitude  \
0  Avon and Somerset Constabulary        NaN       NaN   
1  Avon and Somerset Constabulary   -2.59270   51.5001   
2  Avon and Somerset Constabulary        NaN       NaN   
3  Avon and Somerset Constabulary        NaN       NaN   
4  Avon and Somerset Constabulary   -2.32222   51.2303   

                     Location   LSOAcode      LSOAname  \
0                 No location       None          None   
1  On or near PENRITH GARDENS  E01014631  Bristol 004A   
2                 No location       None          None   
3                 No locatio

In [95]:

# =============================================================================
# step 3 take row with latitude and no lsoa and fill lsoa if possible in df3
# =============================================================================

mask = (df3['LSOAcode'].isnull()) & (df3['Latitude'].notnull())
missing_lsoa_df3=df3.loc[mask]

In [96]:

centroid_lsoa=[[get_lsoa_centroid(x,lsoa_geojson),x] for x in lsoa_data['LSOA11CD']]
# centroid_sa=[[get_sa_centroid(x,sa_geojson),x] for x in sa_data['SA2011']]

centroid_lsoa = [(lat_lon[0], lat_lon[1], code) for (lat_lon, code) in centroid_lsoa]
centroid_lsoa = pd.DataFrame(centroid_lsoa, columns=['lat', 'lon', 'LSOAcode'])

# centroid_sa = [(lat_lon[0], lat_lon[1], code) for (lat_lon, code) in centroid_sa if lat_lon is not None]
# centroid_sa = pd.DataFrame(centroid_sa, columns=['lat', 'lon', 'SA code'])
centroid_sa=pd.read_csv('sa_centroids.csv')


In [97]:

#knn sur les centroid pour trouver le plus proche voisin des lat,lon orphelin de lsoa/sa code
nn_sa = NearestNeighbors(n_neighbors=1, metric='euclidean')
nn_sa.fit(centroid_sa[['lat', 'lon']])

nn_lsoa = NearestNeighbors(n_neighbors=1, metric='euclidean')
nn_lsoa.fit(centroid_lsoa[['lat', 'lon']])

lsoa_codes = []
distance_lsoa = []
sa_codes = []
distance_sa = []


In [98]:

coordinates = missing_lsoa_df3[['Latitude', 'Longitude']].values

# Get nearest neighbors for LSOA
distances_lsoa, indices_lsoa = nn_lsoa.kneighbors(coordinates)

# Get nearest neighbors for SA
distances_sa, indices_sa = nn_sa.kneighbors(coordinates)

# Extract the corresponding LSOAcodes and SA codes
missing_lsoa_df3['lsoa_code_match'] = centroid_lsoa.iloc[indices_lsoa.flatten()]['LSOAcode'].values
missing_lsoa_df3['distance_lsoa_match'] = distances_lsoa.flatten()

missing_lsoa_df3['sa_code_match'] = centroid_sa.iloc[indices_sa.flatten()]['SA code'].values
missing_lsoa_df3['distance_sa_match'] = distances_sa.flatten()


threshold = 0.01  #~1km


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df3['lsoa_code_match'] = centroid_lsoa.iloc[indices_lsoa.flatten()]['LSOAcode'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df3['distance_lsoa_match'] = distances_lsoa.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df3['sa_code_match'] = ce

In [99]:
missing_lsoa_df3.head()

Unnamed: 0,CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Crimetype,Lastoutcomecategory,Context,anneemois,infogeo,id,lsoa_code_match,distance_lsoa_match,sa_code_match,distance_sa_match
21331,,2019-11,British Transport Police,-3.50197,55.9188,On or near Uphall (Station),,,Bicycle theft,,,2019-11,British Transport Police,21332,E01019107,1.049509,N00000217,2.389558
21332,,2019-11,British Transport Police,-2.72987,56.0578,On or near North Berwick (Station),,,Bicycle theft,,,2019-11,British Transport Police,21333,E01027382,0.695959,N00000217,3.133531
21333,,2019-11,British Transport Police,-4.57757,55.9476,On or near Dalreoch (Station),,,Bicycle theft,,,2019-11,British Transport Police,21334,E01019126,1.607198,N00003140,1.596375
21334,,2019-11,British Transport Police,-3.63592,55.9003,On or near Bathgate (Station),,,Bicycle theft,,,2019-11,British Transport Police,21335,E01019126,1.066315,N00000217,2.266318
21335,,2019-11,British Transport Police,-0.020213,53.9585,On or near York (Station),,,Bicycle theft,,,2019-11,British Transport Police,21336,E01013050,0.152368,N00000319,5.443719


In [100]:
len(missing_lsoa_df3)

22402

In [101]:
missing_lsoa_df3.to_csv('missing_lsoa_df3.csv')

In [102]:

# Update the LSOAcode based on the smallest distance
missing_lsoa_df3['LSOAcode'] = np.select(
    [
        (missing_lsoa_df3['distance_lsoa_match'] < threshold) & 
        (missing_lsoa_df3['distance_lsoa_match'] < missing_lsoa_df3['distance_sa_match']),
        (missing_lsoa_df3['distance_sa_match'] < threshold)
    ],
    [
        missing_lsoa_df3['lsoa_code_match'],
        missing_lsoa_df3['sa_code_match']
    ],
    default=missing_lsoa_df3['LSOAcode']
)

missing_lsoa_df3.drop(columns=missing_lsoa_df3.columns[-4:], inplace=True)


missing_lsoa_df3['original_index'] = missing_lsoa_df3.index
missing_lsoa_df3 = missing_lsoa_df3.merge(lsoa_data, left_on='LSOAcode', right_on='LSOA11CD', how='left')
missing_lsoa_df3['LSOAname'] = missing_lsoa_df3['LSOAname'].fillna(missing_lsoa_df3['LSOA11NM'])
missing_lsoa_df3 = missing_lsoa_df3.drop(['LSOA11CD', 'LSOA11NM'], axis=1)

missing_lsoa_df3 = missing_lsoa_df3.merge(sa_data, left_on='LSOAcode', right_on='SA2011', how='left')
missing_lsoa_df3['LSOAname'] = missing_lsoa_df3['LSOAname'].fillna(missing_lsoa_df3['SA2011NAME'])
missing_lsoa_df3 = missing_lsoa_df3.drop(['SA2011', 'SA2011NAME'], axis=1)

missing_lsoa_df3 = missing_lsoa_df3[missing_lsoa_df3['LSOAcode']!='']
missing_lsoa_df3.set_index('original_index', inplace=True)

df3.update(missing_lsoa_df3)
print(df3.head())






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df3['LSOAcode'] = np.select(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df3.drop(columns=missing_lsoa_df3.columns[-4:], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df3['original_index'] = missing_lsoa_df3.index


                                             CrimeID    Month  \
0  1d12d73bb8f1eb0a71c7fd43b0430f3b78ca41ce5e4f90...  2019-11   
1  3742b78ffb3ef54ae0d24d3b366504636e40bd64c36ec1...  2019-11   
2  b4dfed0c7a05192ba7af98ba00bf03551a98c5956d21b6...  2019-11   
3  c697db9e6b7682d089afcb4397488d561e5e188342df3a...  2019-11   
4  14feaa58bcc76c85a71dbe50097655a5aa666cae339aa0...  2019-11   

                       Reportedby  Longitude  Latitude  \
0  Avon and Somerset Constabulary   -3.18786   54.9482   
1  Avon and Somerset Constabulary   -2.50913   51.4161   
2  Avon and Somerset Constabulary   -2.50913   51.4161   
3  Avon and Somerset Constabulary   -2.51176   51.4100   
4  Avon and Somerset Constabulary   -2.50938   51.4096   

                      Location   LSOAcode                           LSOAname  \
0        On or near Field View  E01019107                     Allerdale 002B   
1   On or near St Francis Road  E01014399  Bath and North East Somerset 001A   
2   On or near St Fr

In [103]:
len(missing_lsoa_df3)

22402

In [104]:

# =============================================================================
# step 4 check if row with lsoa but no latitude do exist in df3
# =============================================================================
rows_with_lsoa_no_latitude = df3[df3['LSOAcode'].notnull() & df3['Latitude'].isnull()]

# Display the result
print(rows_with_lsoa_no_latitude)


# =============================================================================
# step 5 take row with latitude and no lsoa and fill lsoa if possible in df2 (repeat step 3)
# =============================================================================


missing_lsoa_df2 = df2[df2['Latitude'].notnull()]
# =============================================================================
# missing_lsoa_df2['LSOAcode']=None
# missing_lsoa_df2['LSOAname']=None
# =============================================================================
lsoa_codes = []
distance_lsoa = []
sa_codes = []
distance_sa = []

coordinates = missing_lsoa_df2[['Latitude', 'Longitude']].values

# Get nearest neighbors for LSOA
distances_lsoa, indices_lsoa = nn_lsoa.kneighbors(coordinates)

# Get nearest neighbors for SA
distances_sa, indices_sa = nn_sa.kneighbors(coordinates)

# Extract the corresponding LSOAcodes and SA codes
missing_lsoa_df2['lsoa_code_match'] = centroid_lsoa.iloc[indices_lsoa.flatten()]['LSOAcode'].values
missing_lsoa_df2['distance_lsoa_match'] = distances_lsoa.flatten()

missing_lsoa_df2['sa_code_match'] = centroid_sa.iloc[indices_sa.flatten()]['SA code'].values
missing_lsoa_df2['distance_sa_match'] = distances_sa.flatten()

threshold = 0.05  #~1km


# Update the LSOAcode based on the smallest distance
missing_lsoa_df2['LSOAcode'] = np.select(
    [
        (missing_lsoa_df2['distance_lsoa_match'] < threshold) & 
        (missing_lsoa_df2['distance_lsoa_match'] < missing_lsoa_df2['distance_sa_match']),
        (missing_lsoa_df2['distance_sa_match'] < threshold)
    ],
    [
        missing_lsoa_df2['lsoa_code_match'],
        missing_lsoa_df2['sa_code_match']
    ],
    default=missing_lsoa_df2['LSOAcode']
)

missing_lsoa_df2.drop(columns=missing_lsoa_df2.columns[-4:], inplace=True)

missing_lsoa_df2['original_index'] = missing_lsoa_df2.index
missing_lsoa_df2 = missing_lsoa_df2.merge(lsoa_data, left_on='LSOAcode', right_on='LSOA11CD', how='left')
missing_lsoa_df2['LSOAname'] = missing_lsoa_df2['LSOAname'].fillna(missing_lsoa_df2['LSOA11NM'])
missing_lsoa_df2 = missing_lsoa_df2.drop(['LSOA11CD', 'LSOA11NM'], axis=1)

missing_lsoa_df2 = missing_lsoa_df2.merge(sa_data, left_on='LSOAcode', right_on='SA2011', how='left')
missing_lsoa_df2['LSOAname'] = missing_lsoa_df2['LSOAname'].fillna(missing_lsoa_df2['SA2011NAME'])
missing_lsoa_df2 = missing_lsoa_df2.drop(['SA2011', 'SA2011NAME'], axis=1)

missing_lsoa_df2 = missing_lsoa_df2[missing_lsoa_df2['LSOAcode'].notnull()]
missing_lsoa_df2.set_index('original_index', inplace=True)


# df2['LSOAcode']=None
# df2['LSOAname']=None
df2.update(missing_lsoa_df2)
print(df2.head())











Empty DataFrame
Columns: [CrimeID, Month, Reportedby, Longitude, Latitude, Location, LSOAcode, LSOAname, Crimetype, Lastoutcomecategory, Context, anneemois, infogeo, id]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df2['lsoa_code_match'] = centroid_lsoa.iloc[indices_lsoa.flatten()]['LSOAcode'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df2['distance_lsoa_match'] = distances_lsoa.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_lsoa_df2['sa_code_match'] = ce

             Type                Date  Partofapolicingoperation  \
0   Person search 2019-11-01 02:21:00                       NaN   
1   Person search 2019-11-01 03:40:00                       NaN   
2  Vehicle search 2019-11-01 05:30:00                       NaN   
3   Person search 2019-11-01 08:40:00                       NaN   
4   Person search 2019-11-01 09:30:00                       NaN   

  Policingoperation  Latitude  Longitude Gender Agerange  \
0              None   51.1473   -2.71787   Male    25-34   
1              None   51.2327   -2.98853   Male    10-17   
2              None   51.4500   -2.59708   None     None   
3              None   51.1327   -2.99507   None     None   
4              None   50.9439   -2.61065   Male  over 34   

                                Selfdefinedethnicity Officerdefinedethnicity  \
0                 White - Any other White background                   White   
1  White - English/Welsh/Scottish/Northern Irish/...                   White

In [105]:
df1.columns

Index(['CrimeID', 'Month', 'Reportedby', 'Longitude', 'Latitude', 'Location',
       'LSOAcode', 'LSOAname', 'Outcometype', 'anneemois', 'infogeo', 'id'],
      dtype='object')

In [47]:
fields_outcomes_temp=['Longitude', 'Latitude', 'LSOAcode', 'LSOAname']
fields_stopandsearch_temps=['Longitude', 'Latitude', 'LSOAcode', 'LSOAname']
fields_street_temp=['Longitude', 'Latitude', 'LSOAcode', 'LSOAname']


In [48]:
len(missing_lsoa_df3)

22402

In [114]:
engine=connect_maria(db_name)

In [115]:
conn=engine.connect()

In [109]:
df1.to_sql('table_travail',conn,if_exists='replace',index=False)
     

405433

In [110]:
df

Index(['CrimeID', 'Month', 'Reportedby', 'Longitude', 'Latitude', 'Location',
       'LSOAcode', 'LSOAname', 'Outcometype', 'anneemois', 'infogeo', 'id'],
      dtype='object')

In [112]:
                         
insert_query = """
    INSERT INTO outcomes_temp (CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Outcometype,anneemois,infogeo,id)
    SELECT CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Outcometype,anneemois,infogeo,id
    FROM table_travail
    ON DUPLICATE KEY UPDATE 
        LSOAcode = VALUES(LSOAcode),
        LSOAname = VALUES(LSOAname);
    """

with engine.connect() as conn:
    conn.execute(text(insert_query))
    conn.commit()




In [117]:
df3.columns

Index(['CrimeID', 'Month', 'Reportedby', 'Longitude', 'Latitude', 'Location',
       'LSOAcode', 'LSOAname', 'Crimetype', 'Lastoutcomecategory', 'Context',
       'anneemois', 'infogeo', 'id'],
      dtype='object')

In [118]:
df3.to_sql('table_travail',conn,if_exists='replace',index=False)

937274

In [119]:
insert_query = """
    INSERT INTO street_temp (CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Crimetype,Lastoutcomecategory,Context,anneemois,infogeo,id)
    SELECT CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Crimetype,Lastoutcomecategory,Context,anneemois,infogeo,id
    FROM table_travail
    ON DUPLICATE KEY UPDATE 
        LSOAcode = VALUES(LSOAcode),
        LSOAname = VALUES(LSOAname);
    """

with engine.connect() as conn:
    conn.execute(text(insert_query))
    conn.commit()


OperationalError: (pymysql.err.OperationalError) (1205, 'Lock wait timeout exceeded; try restarting transaction')
[SQL: 
    INSERT INTO street_temp (CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Crimetype,Lastoutcomecategory,Context,anneemois,infogeo,id)
    SELECT CrimeID,Month,Reportedby,Longitude,Latitude,Location,LSOAcode,LSOAname,Crimetype,Lastoutcomecategory,Context,anneemois,infogeo,id
    FROM table_travail
    ON DUPLICATE KEY UPDATE 
        LSOAcode = VALUES(LSOAcode),
        LSOAname = VALUES(LSOAname);
    ]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [116]:
update_table_with_df('street_temp', fields_outcomes_temp, missing_lsoa_df3, fields_outcomes_temp, engine,conn ,'id', 'id')

ProgrammingError: (pymysql.err.ProgrammingError) nan can not be used with MySQL
[SQL: UPDATE street_temp SET `Longitude`=%(Longitude)s, `Latitude`=%(Latitude)s, `LSOAcode`=%(LSOAcode)s, `LSOAname`=%(LSOAname)s WHERE street_temp.id = %(id_1)s]
[parameters: {'Longitude': -3.50197, 'Latitude': 55.9188, 'LSOAcode': None, 'LSOAname': nan, 'id_1': 21332}]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
conn.close()

In [None]:
conn.close()