In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [None]:
# Apply matplotlib's 'dark_background' style sheet to the figures drawn in this notebook.
plt.style.use('dark_background') 

def highlight_odd_rows(s):
    '''
    Builds the CSS used to style a dataframe visualization.

    Every element gets a white border; elements at odd positions (1, 3, 5, ...)
    additionally get an indigo background.
    Args:
       s : Sequence of cells handed over by ``Styler.apply`` (only its length is used)
    Returns:
       styles (list): One CSS string per element of ``s``, in the same order
    '''
    plain = 'border: 1px solid white;'
    shaded = 'background-color: indigo; border: 1px solid white;'
    return [shaded if position % 2 == 1 else plain for position in range(len(s))]

In [None]:
# Load the records to analyse from a CSV file referenced by a relative path.
df = pd.read_csv("regularizacion100000.csv", encoding = 'utf-8')

In [None]:
# Preview the first rows, rendered with the alternating-row styling helper.
df.head().style.apply(highlight_odd_rows)

In [None]:
def getDfSize(df):
    '''
    Reports the dimensions of a dataframe
    Args:
       df (DataFrame) : Dataframe to measure
    Returns:
       (dictionary): Number of rows under 'rows' and number of columns under 'columns'
    '''
    rowAxis, columnAxis = df.axes
    return {'rows': len(rowAxis), 'columns': len(columnAxis)}

In [None]:
# Show how many rows and columns were loaded.
getDfSize(df)

<h3>La siguiente función extrae las distintas combinaciones de llaves de los diccionarios que una columna del dataframe contiene como valores</h3>

In [None]:
def determineDistinctKeys(columnToList): # Expects a list, not a dataframe column
    '''
    Counts how often each distinct set of keys appears in a list of JSON objects
    Args:
       columnToList (list) : JSON-encoded dictionaries (strings) whose keys are inspected
    Returns:
       distinctKeysDict (dictionary): Maps every distinct key set, written as its sorted keys
       joined by '/', to the number of entries of the list that have exactly those keys
    '''
    distinctKeysDict = {}
    for rawEntry in columnToList:
        # Sorting makes the signature independent of the key order inside each object
        signature = '/'.join(sorted(json.loads(rawEntry)))
        # The first sighting counts as 1, so the value is the real number of appearances
        distinctKeysDict[signature] = distinctKeysDict.get(signature, 0) + 1
    return distinctKeysDict

<h3>La siguiente función extrae los distintos valores de una columna en el dataframe</h3>

In [None]:
def determineDistinctValues(df, col):
    '''
    Lists every value of a column once, dropping repetitions
    Args:
       df (DataFrame) : Dataframe which contains the column to inspect
       col (string) : Name of the column to inspect
    Returns:
       (array): Iterable array with the distinct values of the column
    '''
    column = df[col]
    return column.unique()

<h3>Con esto sabemos que la columna "Response Body" tiene un cuerpo idéntico para todos los casos</h3>

In [None]:
# Extract the 'responseBody' column as a plain Python list, the input type determineDistinctKeys expects.
responseBody = df['responseBody'].values.tolist()

In [None]:
# Count the distinct key sets found in the response bodies.
determineDistinctKeys(responseBody)

<h3>Con esto sabemos la proporción entre consultas con algún tipo de error y consultas normales en Carfax USA</h3>

In [None]:
# Extract the 'carfaxUsaData' column as a plain Python list.
carfaxUsaData = df['carfaxUsaData'].values.tolist()

In [None]:
# Count the distinct key sets found in the 'carfaxUsaData' entries.
determineDistinctKeys(carfaxUsaData)

<h3>Finalmente sabemos qué se pide en los "Request Parameters"</h3>

In [None]:
# Extract the 'requestParameters' column as a plain Python list.
requestParameters = df['requestParameters'].values.tolist()

In [None]:
# Count the distinct key sets found in the request parameters.
determineDistinctKeys(requestParameters)

<h3>Ahora nos aseguramos de todos los valores distintos que haya en las columnas</h3>

In [None]:
determineDistinctValues(df, 'apiKey') # Only one possible value exists for this field

In [None]:
# Number of distinct values in the 'ip' column.
len(list(determineDistinctValues(df, 'ip')))

In [None]:
# Number of distinct VINs seen for each value of 'ip', one row per 'ip'.
# (Replaces an earlier hand-written loop over the grouped lists with a single groupby.)
ips_vins_frame = (
    df.groupby('ip')['VIN']
    .nunique()  # Count unique 'VIN's in each group
    .reset_index()  # Convert the series back to a DataFrame
    .rename(columns={'VIN': 'count'})  # Rename the column for clarity
)

In [None]:
# Summary statistics restricted to the 'ip' values with more than 3 distinct VINs.
ips_vins_frame[ips_vins_frame['count']>3].describe()

In [None]:
determineDistinctValues(df, 'userId') # Only one possible value exists for this field

In [None]:
list(determineDistinctValues(df, 'idReporte')); # Several values exist (what does this difference depend on?)

In [None]:
determineDistinctValues(df, 'responseCode') # Only one possible value exists for this field

In [None]:
determineDistinctValues(df, 'responseCodeStatus') # Only one possible value exists for this field

In [None]:
determineDistinctValues(df, 'labels') # Only one possible value exists for this field

In [None]:
determineDistinctValues(df, 'firewallUsa') # Only one possible value exists for this field

<h3>A continuación busco los distintos tipos de código de alerta que se encuentran en el dataframe creando una función especial para ello </h3>

In [None]:
def countAlertCodes(df, alertsColumn: str):
    '''
    Counts the alert codes in a column of a dataframe
    Args:
       df (DataFrame) : Dataframe which contains the column to search
       alertsColumn (string) : Name of the column holding the alerts. Each cell is a JSON-encoded
       list of alerts (an already parsed list is accepted too); every alert is a dictionary
       with a 'codigo' entry
    Returns:
       codeCount (dictionary): Maps 'Code <n>' to the number of alerts carrying code n. Codes 1 to 7
       are always present (possibly with 0); any other code found in the data is added when it appears
    '''
    # Codes 1-7 are pre-seeded so they are always reported, even with zero occurrences
    codeCount = {f'Code {n}': 0 for n in range(1, 8)}

    # Read the column the caller asked for instead of a hard-coded one
    for alert in df[alertsColumn].values.tolist():
        alertList = json.loads(alert) if isinstance(alert, str) else alert
        for subalert in alertList:
            label = f"Code {subalert['codigo']}"
            # .get keeps an unexpected code from aborting the whole count with a KeyError
            codeCount[label] = codeCount.get(label, 0) + 1

    return codeCount


In [None]:
# Tally the alert codes stored in the 'alertas' column.
countAlertCodes(df, 'alertas')

<h3>Ahora calculo el tiempo en que se tarda cada petición en generar una respuesta</h3>

In [None]:
def calculateDeltaTime(df, columnA, columnB):
    '''
    Converts columnA and columnB to datetime and computes columnA - columnB in seconds
    Args:
       df (DataFrame) : Dataframe which contains the columns to subtract
       columnA (string) : Minuend column
       columnB (string): Subtrahend column
    Returns:
       responsePeriod (DataFrame): Single-column dataframe 'secondsDifference' holding the
       difference in seconds (float), indexed like df
    '''
    minuend = pd.to_datetime(df[columnA])
    subtrahend = pd.to_datetime(df[columnB])
    # Name the column explicitly: the difference of two Series only gets the implicit label 0
    # when their names differ, so relying on that label is fragile.
    return (minuend - subtrahend).dt.total_seconds().to_frame(name='secondsDifference')

<h4>Con mayor frecuencia parece tardarse un segundo la respuesta. La gráfica parece ajustarse a una distribución de Poisson o a una distribución normal. Checar cuál es la más conveniente para calcular probabilidades</h4>

In [None]:
timeFrame = calculateDeltaTime(df, 'responseTime', 'requestTime')
# Responses of up to 6 seconds. The bound is inclusive so that records at exactly 6 seconds are
# not dropped from both histograms, matching the "<= 6" / "> 6" split used by the summary tables.
plt.hist(timeFrame[timeFrame['secondsDifference']<=6]['secondsDifference'], bins = 100, edgecolor = 'black')
plt.xlabel('Time in Seconds')
plt.ylabel('Frequency')
plt.title('Histogram of Time in Seconds')
plt.show()

In [None]:
# Tail of the distribution: responses that took more than 6 seconds.
slowResponses = timeFrame.loc[timeFrame['secondsDifference'] > 6, 'secondsDifference']
plt.hist(slowResponses, bins=100, edgecolor='black')
plt.title('Histogram of Time in Seconds')
plt.xlabel('Time in Seconds')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics of the response time (seconds) over all records.
timeFrame.describe().style.apply(highlight_odd_rows)

In [None]:
# Summary statistics for the responses that took at most 6 seconds.
timeFrame[timeFrame['secondsDifference'] <= 6].describe().style.apply(highlight_odd_rows)

In [None]:
# Summary statistics for the responses that took more than 6 seconds.
timeFrame[timeFrame['secondsDifference'] > 6].describe().style.apply(highlight_odd_rows)

In [None]:
# Longest response time in seconds, returned as a one-element list.
timeFrame.max().values.tolist()

<h3>A continuación se comprueba que a cada diccionario de la columna 'carfaxUsaData' que tenga una llave "error" le corresponda un valor distinto de lista vacía en la columna 'alertas'</h3>

In [None]:
def proveErrorAlert(df):
    '''
    Assures every error in CarfaxUsaData is associated with an alert
    Args:
       df (DataFrame) : Dataframe with the 'carfaxUsaData' column (JSON-encoded dictionaries) and
       the 'alertas' column (JSON-encoded lists; already parsed lists are accepted too)
    Returns:
       (string): 'Existe un valor con error y sin alerta' as soon as a record whose carfaxUsaData
       has an 'error' key comes with an empty alert list, otherwise 'Funciona correctamente'
    '''
    carfaxValues = df['carfaxUsaData'].values.tolist()
    alertValues = df['alertas'].values.tolist()
    for carfaxRaw, alertsRaw in zip(carfaxValues, alertValues):
        if 'error' not in json.loads(carfaxRaw):
            continue
        # The alerts are stored as JSON text, so '[]' must be decoded before it can be recognised
        # as "no alerts"; comparing the raw cell against [] never matches.
        alerts = json.loads(alertsRaw) if isinstance(alertsRaw, str) else alertsRaw
        if len(alerts) == 0:
            return 'Existe un valor con error y sin alerta'
    return 'Funciona correctamente'

In [None]:
# Run the error/alert consistency check on the loaded records.
proveErrorAlert(df)

<h3>Ahora se estudiará la relación entre los VINs repetidos y las alertas</h3>

In [None]:
def getRepeatedValuesInAColumn(df, columnName):
    '''
    Identifies the repeated values of a column in a dataframe
    Args:
       df (DataFrame) : Dataframe which contains the column to search
       columnName (string) : Name of the column to search
    Returns:
       (pandas Series): The entries of columnName whose value appears more than once in that
       column; every occurrence is kept, so each repeated value shows up at least twice
    '''
    duplicatedRows = df[df.duplicated(subset=[columnName], keep=False)]
    # Return the column that was searched rather than a fixed 'VIN' column
    return duplicatedRows[columnName]

In [None]:
def searchValuesInADataframe(df, values:list, columnA: str, columnB: str)-> dict:
    '''
    Looks up the given values in columnA and collects, for each one found, its columnB entries.
    Args:
       df (DataFrame): The dataframe to search in.
       values (list): The values to look for in columnA.
       columnA (str): The name of the column that is searched.
       columnB (str): The name of the column whose entries are collected.
    Returns:
      (dict): One key per value found in columnA, mapped to the list of columnB entries of its rows.
    '''
    isWanted = df[columnA].isin(values)
    columnBByKey = df[isWanted].groupby(columnA)[columnB]
    return columnBByKey.apply(list).to_dict()

In [None]:
# Distinct VINs that appear in more than one record.
proofRepeated = getRepeatedValuesInAColumn(df, 'VIN').unique().tolist()

In [None]:
# For every repeated VIN, the list of 'alertas' entries of its records.
repeatedValues = searchValuesInADataframe(df, proofRepeated, 'VIN', 'alertas')

<h4>Identificamos si existe algún VIN repetido que a la vez tenga y no tenga mensajes de alerta. Además, se identifica la cantidad de veces que se repiten los VINs que lanzan alerta frente a los que no lanzan ninguna alerta</h4>

In [None]:
def getIncorrectRepeatedVinInformation(df)->dict:
    '''
    Classifies every repeated VIN by whether its records carry alerts.

    Each repeated VIN is classified exactly once:
      - good: every one of its records has the empty alert list '[]'
      - bad: at least one of its records has a non-empty alert list; when some of its
        records also have '[]', an extra warning message is produced for that VIN
    Args:
       df (DataFrame): The dataframe to search, with the 'VIN' and 'alertas' columns
    Returns:
      (dict): 'message' (list of warning strings followed by a summary string),
      'goodVinsList' / 'badVinsList' (lists of [VIN, number of records of that VIN]) and
      'goodVins' / 'badVins' (number of VINs in each list)
    '''
    vinValues = getRepeatedValuesInAColumn(df, 'VIN').unique().tolist()
    repeatedValues = searchValuesInADataframe(df, vinValues, 'VIN', 'alertas')
    goodVinsList = []
    badVinsList = []
    messages = []
    for repeatedVin, alertValues in repeatedValues.items():
        distinctAlerts = set(alertValues)
        if distinctAlerts == {'[]'}:
            goodVinsList.append([repeatedVin, len(alertValues)])
            continue
        if '[]' in distinctAlerts:
            messages.append(''.join(['Alerta con vin: ',repeatedVin, ' , tiene y no tiene alertas.']))
        # One entry per VIN, however many different alert payloads it has, so neither the VIN
        # nor its records are counted more than once downstream
        badVinsList.append([repeatedVin, len(alertValues)])

    # Derive the counters from the lists so both views always agree
    goodVins = len(goodVinsList)
    badVins = len(badVinsList)
    messages.append(''.join(['VINs repetidos que no lanzan alerta: ', str(goodVins), '\nVINs repetidos que sí lanzan alerta: ', str(badVins)]))
    return {'message': messages,
           'goodVinsList': goodVinsList,
           'badVinsList': badVinsList,
           'goodVins':goodVins,
           'badVins': badVins}

In [None]:
# Classify the repeated VINs of the loaded records by the presence of alerts.
incorrectInformation = getIncorrectRepeatedVinInformation(df)

In [None]:
# Show the warning and summary messages produced by the classification.
incorrectInformation['message']

In [None]:
def diagnoseVins(df, incorrectInformation):
    '''
    Converts the lists of getIncorrectRepeatedVinInformation() into dataframes
    Args:
       df (DataFrame): Not used by this function; kept so existing calls keep working
       incorrectInformation (dict): Result of getIncorrectRepeatedVinInformation(); its
       'goodVinsList' and 'badVinsList' entries (lists of [VIN, repeated times]) are read
    Returns:
      (dict): 'goodVinsFrame' / 'badVinsFrame' (dataframes with columns 'VIN' and 'repeatedTimes')
      and 'goodRepeatedRecords' / 'badRepeatedRecords' (sum of 'repeatedTimes' of each frame)
    '''
    frameColumns = ['VIN', 'repeatedTimes']
    goodVinsFrame = pd.DataFrame(incorrectInformation['goodVinsList'], columns=frameColumns)
    badVinsFrame = pd.DataFrame(incorrectInformation['badVinsList'], columns=frameColumns)
    # Sum only the numeric column: summing the whole frame would also concatenate every VIN string
    goodRepeatedRecords = goodVinsFrame['repeatedTimes'].sum()
    badRepeatedRecords = badVinsFrame['repeatedTimes'].sum()

    return {'goodVinsFrame': goodVinsFrame, 'badVinsFrame': badVinsFrame, 'goodRepeatedRecords': goodRepeatedRecords, 'badRepeatedRecords': badRepeatedRecords}

In [None]:
# Turn the good/bad VIN lists into dataframes plus their record totals.
diagnosedVins = diagnoseVins(df, incorrectInformation)

In [None]:
# Summary statistics of the repetition counts of the VINs without alerts.
diagnosedVins['goodVinsFrame'].describe().style.apply(highlight_odd_rows)

In [None]:
# Distribution of how many times each alert-free VIN is repeated.
fig, ax = plt.subplots()
ax.hist(diagnosedVins['goodVinsFrame']['repeatedTimes'], bins=50, edgecolor='black')
ax.set_xlabel('Repeated Times')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Repeated Times by VIN (without alerts)')
plt.show()

In [None]:
# Summary statistics of the repetition counts of the VINs with alerts.
diagnosedVins['badVinsFrame'].describe().style.apply(highlight_odd_rows)

In [None]:
# Distribution of how many times each VIN with alerts is repeated.
fig, ax = plt.subplots()
ax.hist(diagnosedVins['badVinsFrame']['repeatedTimes'], bins=50, edgecolor='black')
ax.set_xlabel('Repeated Times')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Repeated Times by VIN (with alerts)')
plt.show()

<h4>Finalmente tenemos los VINs revisados en todos los registros: la proporción entre VINs repetidos con alerta y VINs repetidos sin alerta, y los máximos y mínimos de repeticiones</h4>

In [None]:
# Totals of repeated records, split by whether the VIN has alerts or not.
goodRecords = diagnosedVins['goodRepeatedRecords']
badRecords = diagnosedVins['badRepeatedRecords']
# Number of VINs behind those records: non-null 'repeatedTimes' entries of both frames.
reviewedVins = (
    diagnosedVins['badVinsFrame']['repeatedTimes'].count()
    + diagnosedVins['goodVinsFrame']['repeatedTimes'].count()
)
print('Cantidad de registros repetidos: ', goodRecords + badRecords)
print('Cantidad de registros repetidos con VINs sin alerta: ', goodRecords)
print('Cantidad de registros repetidos con VINs con alerta: ', badRecords)
print('Cantidad de VINs revisados en esos registros repetidos: ', reviewedVins)

# Pie chart with the share of repeated records of each kind.
repetitionProportion = [goodRecords, badRecords]
labelsRepetition = ['VINs without alerts', 'VINs with alerts']
plt.pie(repetitionProportion, labels=labelsRepetition, autopct="%0.1f %%")
plt.axis("equal")
plt.title('Repeated records')
plt.show()

<h4>Estadísticas de las repeticiones de los VINs sin alerta</h4>

In [None]:
# Summary statistics of the repetition counts of the VINs without alerts.
diagnosedVins['goodVinsFrame'].describe().style.apply(highlight_odd_rows)

<h4>Estadísticas de las repeticiones de los VINs con alerta</h4>

In [None]:
# Summary statistics of the repetition counts of the VINs with alerts.
diagnosedVins['badVinsFrame'].describe().style.apply(highlight_odd_rows)

<h3>Ahora se comprueba que para cada VIN repetido se dio exactamente la misma información y que, en caso de tener alertas, también se dieron las mismas alertas</h3>

In [None]:
def _summarizeResponse(responseJson):
    '''
    Keeps only the fields of a parsed response body that are compared between records of a VIN.

    'fabricante' and 'paisOrigen' are included only when both are present in the response.
    'codes' holds the distinct 'codigo' values of its 'mensajes', sorted so that the same set
    of codes always yields the same list.
    '''
    if 'fabricante' in responseJson and 'paisOrigen' in responseJson:
        fields = ('anioModelo', 'fabricante', 'marca', 'modelo', 'paisOrigen', 'robo', 'roboFecha')
    else:
        fields = ('anioModelo', 'marca', 'modelo', 'robo', 'roboFecha')
    info = {field: responseJson[field] for field in fields}
    # sorted() returns the ordered list; list.sort() returns None and would erase the codes
    info['codes'] = sorted({message['codigo'] for message in responseJson['mensajes']})
    return info


def verifyInfoRepeatedVins(df):
    '''
    Verifies every repeated VIN has the same information in every record (due to the short period time reviewed)
    Args:
       df (DataFrame): The dataframe to search, with the 'VIN' and 'responseBody' columns;
       'responseBody' holds JSON-encoded dictionaries
    Returns:
      (dict): 'uniqueResponses' maps every repeated VIN to the list of distinct summarized
      responses found for it; 'differentResponses' maps every VIN with more than one distinct
      response to the last distinct response found
    '''
    uniqueResponses = {}
    differentResponses = {}
    vinValues = getRepeatedValuesInAColumn(df, 'VIN').unique().tolist()
    repeatedValues = searchValuesInADataframe(df, vinValues, 'VIN', 'responseBody')
    for vin, responseStrings in repeatedValues.items():
        distinctInfos = []
        for responseString in responseStrings:
            info = _summarizeResponse(json.loads(responseString))
            if info in distinctInfos:
                continue
            distinctInfos.append(info)
            # Anything after the first distinct response means the VIN got differing answers
            if len(distinctInfos) > 1:
                differentResponses[vin] = info
        uniqueResponses[vin] = distinctInfos
    return {'uniqueResponses': uniqueResponses, 'differentResponses': differentResponses}

In [None]:
# Compare the responses given to each repeated VIN and warn when any VIN got differing ones.
verifyResponses = verifyInfoRepeatedVins(df)
if verifyResponses['differentResponses'] != {}:
    print('Existen respuestas distintas para un mismo VIN')

In [None]:
# VINs for which more than one distinct response was found.
differentResponseVins = list(verifyResponses['differentResponses'].keys())

In [None]:
# List the VINs with differing responses.
print(differentResponseVins)

In [None]:
# For every VIN with differing responses, print each of its distinct summarized responses.
for response in differentResponseVins:
    print('VIN', response)
    for information in verifyResponses['uniqueResponses'][response]:
        print(information)
    print('------------')

In [None]:
# Cálculo de la probabilidad usando la fórmula previamente descrita
prob_menor_o_igual_a_6 = 0.99725
prob_ninguno_mayor_a_6 = prob_menor_o_igual_a_6 ** 100
prob_al_menos_uno_mayor_a_6 = 1 - prob_ninguno_mayor_a_6

print(prob_al_menos_uno_mayor_a_6)

In [None]:
# Square of the previous probability — presumably the chance of it happening in two independent batches; confirm intent.
print(prob_al_menos_uno_mayor_a_6**2)

In [None]:
# Parse 'responseTime' into datetimes, overwriting the column of df in place.
# The commented-out lines are a disabled filter that would keep only the records before 2024-11-18.
#cutoff_date = pd.Timestamp('2024-11-18')
df['responseTime'] = pd.to_datetime(df['responseTime'])
#df = df[df['responseTime']< cutoff_date]

In [None]:
# Derive time features from 'responseTime' (requires the datetime conversion above to have run).
df['hour'] = df['responseTime'].dt.hour  # hour of the day
df['dayOfWeek'] = df['responseTime'].dt.day_name()  # weekday name, e.g. 'Monday'
df['date'] = df['responseTime'].dt.date  # calendar date without the time part

In [None]:
# Number of records per hour of the day.
df.groupby('hour').size()

In [None]:
# Number of records per weekday.
df.groupby('dayOfWeek').size()

In [None]:
# Number of distinct values of every other column for each (weekday, hour) pair.
df.groupby(by = ['dayOfWeek', 'hour']).nunique()

In [None]:
# Mondays only: distinct values of every column per (date, hour), flattened back to regular columns.
xd =df[df['dayOfWeek'] == 'Monday'].groupby(by = ['date', 'hour']).nunique().reset_index()

In [None]:
# Keep the hour-0 rows of each Monday together with their number of distinct '_id' values.
xd2 = xd[['date', 'hour', '_id']][xd['hour']==0]

In [None]:
# Mean number of distinct '_id' values at hour 0 across Mondays.
xd2['_id'].mean()

In [None]:
# Standard deviation of that count across Mondays.
xd2['_id'].std()

In [None]:
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Distinct '_id' values per weekday, calendar date and hour. Only '_id' is aggregated: counting
# the distinct values of every column for each weekday is far more work and only '_id' is used.
ids_per_slot = df.groupby(['dayOfWeek', 'date', 'hour'])['_id'].nunique()
# Mean and sample standard deviation of that count across the dates of each (weekday, hour) pair.
slot_stats = ids_per_slot.groupby(level=['dayOfWeek', 'hour']).agg(['mean', 'std'])

typical_records = {}
for day in days_of_week:
    for hour in range(24):
        if (day, hour) in slot_stats.index:
            mean, std = slot_stats.loc[(day, hour), ['mean', 'std']]
        else:
            # No record at all for this weekday/hour: NaN, as an empty selection would give
            mean = std = float('nan')
        typical_records[f'{day}_{hour}'] = {'mean': float(mean), 'std': float(std)}

# NOTE: NaN entries (slots with no data, or a std over a single date) are written by json.dump
# as the token NaN, which strict JSON parsers reject.
with open('meanRecordsByHour.json', "w", encoding='utf-8') as outfile:
    json.dump(typical_records, outfile)