In [205]:
#importar algunas librerias que estaremos usando comunmente.
import numpy as np
import pandas as pd
from dateutil.parser import parse
import re

# pd.set_option('display.max_colwidth', None) # To show all row in a colum
pd.options.display.max_colwidth = 50


In [207]:
class data_analyst:
    def __init__(self , file_name):
        self.file_name = file_name # Que sera el atributo, conocido con el nombre del archivo
        self.data =  None #Al instanciarlo, le cargaremos los datos respectivos con la libreria Pandas
    
    #De aqui en adelante son los metods
    
    def open_file(self):
        
        """
        Function to open files directly with the name, to reduce cost on our code
        """
        try:
            if self.file_name.endswith('.csv'):
                self.data = pd.read_csv(self.file_name)
                #print(f"Your .csv file was opened sucessfully")
            elif self.file_name.endswith('.xls') or self.file_name.endswith('.xlsx'):
                self.data = pd.read_excel(self.file_name)
                #print(f'Your .xls / xlsx was opened sucessfully')
            else:
                print('This file is not valid, please try with one with the ext csv / xls / xlsx ')
                
        except Exception as e:
            print(f"There is a error trying to open the file, error-code: {e}")
    
    def show_rows(self, rows = 10):
        """
        Funcion para mostrar un cabezal, o una cantidad de filas, por defecto esta en 10
        Args:
            rows (int, optional): _description_. Defaults to 10.
        """
        
        #iniciamos con una validacion de que nuestros datos tienen un valor asignado, dado que None es la representacion de la ausencia de valor.
        if self.data is not None:
            #Retornamos la impresion de las filas, con n valor, que en caos de que no se modifique rows = 10
            return self.data.head(rows)
        
        else:
            print(f'There is not a file upload yet')
    
    def info(self):
        """Funcion para retornar la funcion info de pandas, con el data_set que cargamos"""
        if self.data is not None:
            return self.data.info()
        
        else:
            print(f"There is not file upload yet")
    
    def describe(self):
        
        """Funcion para retornar una descripcion de pandas"""
        if self.data is not None:
            return self.data.describe()
        
        else:
            print("There is not file upload yet")
    
    def show_columns_names(self):
        """ Muestra y retorna el nombre de las columnas, las retorna en una lista-"""
        cols = [] # Se crea un lista para guardar el nombre de las columnas, la retornaremos en caso de querer usarla
        if self.data is not None:
            print("There is the name of the columns")
            for col in self.data.columns:
                print(col)
                cols.append(col)
        else:
            print('No hay columas en tu data_frame')
        
        return cols
    
    def rename_columns(self, actual_name, new_name):
        ''' Actual Name - New Name, String Format'''
        #Comenzamos validando la existencia de datos en nuestro data frame
        if self.data is not None:
            
            if actual_name in self.data.columns:
                self.data.rename(columns={actual_name: new_name}, inplace = True)
                print(f'The column "{actual_name}" was renamed as "{new_name}"')
            else:
                print(f'There is not a colum named as "{actual_name}"')
        else:
            print(f"There is file/ dataframe upload yet")
    
    def unique_values(self):
        
        if self.data is not None:  
            # Dictionary Comprehension para obtener el conteo de valores únicos en cada columna.
            # - 'col' es el nombre de la columna actual en 'self.data.columns'.
            # - La clave del diccionario es 'col'.
            # - El valor es el número de elementos únicos en la columna, obtenido con 'self.data[col].nunique()'.
            #usare un dict comprehencion, para practicar esta estructura
            unique_values = {col : self.data[col].nunique() for col in self.data.columns}
            
            # Iterando sobre el diccionario 'unique_values'.
            # - 'col' representa el nombre de la columna actual.
            # - 'count' representa el número de valores únicos de esa columna.
            # Utilizamos f-strings para imprimir de manera ordenada la columna y su número de valores únicos.
            for col, count in unique_values.items():
                print(f"Columna: {col} -> Valores Únicos: {count}")
        else:
            print(f'There is not file upload yet') 
    
    def split_with_character(self, character, colum_name, drop_old=True):
        if self.data is not None:
            # Dividing based on the provided character. This will give the parts before and after the character.
            #Apply Este método se utiliza para aplicar una función a lo largo del eje del DataFrame.
            # Una función lambda es una pequeña función anónima. Aquí, x representa un elemento individual (o valor de celda)
            #x.split(carácter, 1): Esto utiliza el método split() de Python para dividir la cadena x en la primera aparición del carácter
            #if character in  x: Comprueba si el carácter está presente en la cadena x, de no ser asi ser realiza toda la funcion lambda, de lo contrario se retorna tal cual se encuentra.
            self.data['Before_character'] = self.data[colum_name].apply(lambda x: x.split(character, 1)[0] if character in x else x) # Los indices [0] y [1] despues de character es si devuelven el antes o despues del caracter
            self.data['After_Character'] = self.data[colum_name].apply(lambda x: x.split(character, 1)[1] if character in x else None)

            if drop_old:
                # If we don't want to keep the old column (which is the default behavior), we simply drop it.
                self.data = self.data.drop(columns=[colum_name])

        return self.data
    
    def delete_column(self , column_name):
        """
        Deletes the specified column from the dataframe.
        
        Args:
        - column_name (str): The name of the column to be deleted.
        
        Returns:
        - DataFrame: The updated dataframe without the specified column.
        """
        if self.data is not None:
            if column_name in self.data.columns:
                self.data = self.data.drop(columns=[column_name])
            else:
                print(f"Column '{column_name}' not found in the dataframe.")
            return self.data
    
    def process_character(self, column_name, character, option=1):
        """
        Process a DataFrame column based on the presence of a specified character.
        
        Args:
        - column_name (str): The name of the column to process.
        - character (str): The character to process.
        - option (int): The processing option. 
            1: Delete everything before the character (default).
            2: Delete everything after the character.
            3: Only delete the character.
        
        Returns:
        - pd.DataFrame: The processed DataFrame.
        """
        if option == 1:
            self.data[column_name] = self.data[column_name].apply(lambda x: x.split(character, 1)[1] if x and character in x else x)
        elif option == 2:
            self.data[column_name] = self.data[column_name].apply(lambda x: x.split(character, 1)[0] if x and character in x else x)
        elif option == 3:
            self.data[column_name] = self.data[column_name].str.replace(character, '')
        return self.data

    def handle_null_values(self, column_name, strategy="delete", fill_value=None):
        """
        Handle null values in a DataFrame column.
        
        Args:
        - column_name (str): The name of the column to process.
        - strategy (str): The strategy to handle null values. Options:
            "delete" (default): Delete rows with null values in the specified column.
            "fill": Fill null values with a specified value.
        - fill_value (Any, optional): The value to fill null values with. Required if strategy="fill".
        
        Returns:
        - pd.DataFrame: The processed DataFrame.
        """
        if self.data is not None:
            if strategy == "delete":
                self.data = self.data.dropna(subset=[column_name])
            elif strategy == "fill":
                if fill_value is not None:
                    self.data[column_name].fillna(fill_value, inplace=True)
                else:
                    print("Please provide a fill_value.")
            else:
                print("Invalid strategy.")
            return self.data

    def remove_line_breaks(self, column_name):
        """
        Remove the line breaks of a specify column
        
        Args:
        -Column name (str) : Nombre de la columna, donde quiere que se eliminen los saltos de linea
        
        Returns:
        -pd.Dataframe: Dataframe con el texto actualizado sin los saltos de linea 
        
        
        
        """
        
        
        if self.data is not None:
            if column_name in self.data.columns:
                self.data[column_name] = self.data[column_name].replace('\n', ' ', regex=True)

            else:
                print(f'There is not column with the name "{column_name}"')
            
            return self.data
        else:
            print(f'There is not data load yet')
    
    def preprocess_text(self, text):
        if text is None:
            return None
        
        # Encuentra cualquier número seguido de un nombre de mes y separa con un espacio
        pattern = re.compile(r'(\d{1,2})(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)')
        return pattern.sub(r'\1 \2', str(text))
            
    def preprocess_column(self, column_name):
        if self.data is not None and column_name in self.data.columns:
            self.data[column_name] = self.data[column_name].apply(self.preprocess_text)
        else:
            print(f'Column "{column_name}" not found in the dataframe.')

    def extract_dates(self, column_name):
        if self.data is not None and column_name in self.data.columns:
            # Creamos una columna para las fechas con valores iniciales None
            self.data['Date'] = None

            # Utilizamos una expresión regular para identificar las fechas comunes en formato de texto
            date_pattern = re.compile(r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)\s+\d{1,2}\s+\d{4}\b|\b\d{1,2}\/\d{1,2}\/\d{2,4}\b')

            for index, row in self.data.iterrows():
                column_value = row[column_name]
                if column_value is None:
                    continue

                # Preprocesamos el texto antes de buscar las fechas
                processed_text = self.preprocess_text(column_value)
                dates = date_pattern.findall(processed_text)
                
                # Resto del código
                if dates:
                    try:
                        # Tomamos la primera fecha encontrada (puedes ajustar esto según tus necesidades)
                        date_str = dates[0]
                        date = parse(date_str)

                        # Asignamos la fecha a la nueva columna
                        self.data.at[index, 'Date'] = date

                        # Eliminamos la fecha del texto original
                        self.data.at[index, column_name] = row[column_name].replace(date_str, '').strip()
                    except Exception as e:
                        print(f"Error parsing date from text: {e}")
        else:
            print(f'Column "{column_name}" not found in the dataframe.')
            
            
    def filter_integer(self, column_name):
        """
        Esta función mantiene solo los valores que son enteros en la columna especificada.

        Args:
            column_name (str): El nombre de la columna donde se quiere aplicar el filtro.
        """
        
        if self.data is not None:
            # Utilizamos una función lambda para probar si cada valor puede convertirse en un entero
            # primero comvertimos a una cadena de texto a nuestra variable, obj y despues con una funcion build in comprobamos si es un digito
            is_integer = lambda x: str(x).isdigit()
            
            # Usar 'apply' en la columna de interés y obtener una Serie de booleanos
            integer_mask = self.data[column_name].apply(is_integer)

            # La filtración en pandas utilizando una Serie booleana funciona debido a cómo pandas maneja la indexación booleana. 
            # Cuando pasas una Serie de valores booleanos a un DataFrame, pandas devuelve solo las filas donde el valor correspondiente en la Serie booleana es True.
            # Filtrar el DataFrame usando la máscara booleana
            self.data = self.data[integer_mask]
        
            
        else:
            print(f'There is not file load yet')
            

In [208]:
# En primera forma lo instanciamos
Oppenhim_File = data_analyst("imdb_oppenhimmer_Uncleaned.csv") # Instanciacion de objeto open

Oppenhim_File.open_file()

Oppenhim_File.unique_values()

columns_names = Oppenhim_File.show_columns_names()

Columna: 9/10
Murphy is exceptional
Orlando_Gardner19 July 2023
You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in 

In [209]:
#Un nombre bastante grande para mi gusto
print(columns_names[0])


9/10
Murphy is exceptional
Orlando_Gardner19 July 2023
You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in a return 

In [210]:
Oppenhim_File.rename_columns(columns_names[0], "Qualifity")

The column "9/10
Murphy is exceptional
Orlando_Gardner19 July 2023
You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant 

In [211]:
Oppenhim_File.show_rows()


Unnamed: 0,Qualifity
0,"8/10\nA challenging watch to be sure, but a wo..."
1,10/10\nA brilliantly layered examination of a ...
2,10/10\nNolan delivers a powerfull biopic that ...
3,10/10\nA Masterpiece\nmohameddawoud-2601919 Ju...
4,10/10\nAnd the Oscar goes to...\nmark-217-3070...
5,10/10\nAnother Cinematic Masterpiece by Christ...
6,8/10\nThis movie is the bomb!\nzeki-420 July 2...
7,9/10\nExceptional storytelling and Genius Cina...
8,10/10\nFUTURE CLASSIC and EXTREMELY IMPORTANT ...
9,8/10\nIs it just me ?\npdean-4913221 July 2023...


In [212]:
#testing split fuction
Oppenhim_File.split_with_character('/' , 'Qualifity')

Unnamed: 0,Before_character,After_Character
0,8,"10\nA challenging watch to be sure, but a wort..."
1,10,10\nA brilliantly layered examination of a man...
2,10,10\nNolan delivers a powerfull biopic that sho...
3,10,10\nA Masterpiece\nmohameddawoud-2601919 July ...
4,10,10\nAnd the Oscar goes to...\nmark-217-3070331...
...,...,...
84043,7,"10\nHuge story, but production fails sometimes..."
84044,6,10\nTweaked\nfmacr25 July 2023\nI suppose thes...
84045,6,10\nJust a 6 for a near perfect film.\nm-nicol...
84046,6,10\nNo need to be 3 hours\nscottahaynie24 July...


In [213]:
new_columns_name = Oppenhim_File.show_columns_names()
Oppenhim_File.rename_columns(new_columns_name[0] , 'Qualification 0/10 ')
Oppenhim_File.rename_columns(new_columns_name[1] , 'Comments')

There is the name of the columns
Before_character
After_Character
The column "Before_character" was renamed as "Qualification 0/10 "
The column "After_Character" was renamed as "Comments"


In [215]:
Oppenhim_File.show_rows()

Unnamed: 0,Qualification 0/10,Comments
0,8,"10\nA challenging watch to be sure, but a wort..."
1,10,10\nA brilliantly layered examination of a man...
2,10,10\nNolan delivers a powerfull biopic that sho...
3,10,10\nA Masterpiece\nmohameddawoud-2601919 July ...
4,10,10\nAnd the Oscar goes to...\nmark-217-3070331...
5,10,10\nAnother Cinematic Masterpiece by Christoph...
6,8,10\nThis movie is the bomb!\nzeki-420 July 202...
7,9,10\nExceptional storytelling and Genius Ciname...
8,10,10\nFUTURE CLASSIC and EXTREMELY IMPORTANT MOV...
9,8,10\nIs it just me ?\npdean-4913221 July 2023\n...


dado que los caracters 10\n son saltos de linea y algo que dejamos despues de realizar una mala eliminacion

In [216]:
new_names_colums = Oppenhim_File.show_columns_names()

Oppenhim_File.process_character(new_names_colums[1], 'n', 1)

There is the name of the columns
Qualification 0/10 
Comments


Unnamed: 0,Qualification 0/10,Comments
0,8,"ging watch to be sure, but a worthwhile one.\n..."
1,10,tly layered examination of a man throughout al...
2,10,delivers a powerfull biopic that shows the da...
3,10,sider myself lucky to be alive to watch Christ...
4,10,d the Oscar goes to...\nmark-217-30703319 July...
...,...,...
84043,7,fails sometimes\nandreverschoor27 July 2023\n...
84044,6,'t explain these unrealistic reviews. After tw...
84045,6,ear perfect film.\nm-nicolaou-86-66125424 July...
84046,6,eed to be 3 hours\nscottahaynie24 July 2023\nA...


In [217]:
Oppenhim_File.show_rows(10)

Unnamed: 0,Qualification 0/10,Comments
0,8,"ging watch to be sure, but a worthwhile one.\n..."
1,10,tly layered examination of a man throughout al...
2,10,delivers a powerfull biopic that shows the da...
3,10,sider myself lucky to be alive to watch Christ...
4,10,d the Oscar goes to...\nmark-217-30703319 July...
5,10,other Cinematic Masterpiece by Christopher Nol...
6,8,", Christopher Nolan - probably the best blockb..."
7,9,al storytelling and Genius Cinametography\nZay...
8,10,d EXTREMELY IMPORTANT MOVIE FOR THE PRESENT DA...
9,8,-4913221 July 2023\nIs it just me or did anyon...


In [218]:
Oppenhim_File.remove_line_breaks(column_name='Comments')

Unnamed: 0,Qualification 0/10,Comments
0,8,"ging watch to be sure, but a worthwhile one. J..."
1,10,tly layered examination of a man throughout al...
2,10,delivers a powerfull biopic that shows the da...
3,10,sider myself lucky to be alive to watch Christ...
4,10,d the Oscar goes to... mark-217-30703319 July ...
...,...,...
84043,7,fails sometimes andreverschoor27 July 2023 Lo...
84044,6,'t explain these unrealistic reviews. After tw...
84045,6,ear perfect film. m-nicolaou-86-66125424 July ...
84046,6,eed to be 3 hours scottahaynie24 July 2023 Act...


In [219]:
Oppenhim_File.preprocess_column(column_name='Comments')
Oppenhim_File.extract_dates(column_name='Comments')


In [221]:
Oppenhim_File.show_rows()

Unnamed: 0,Qualification 0/10,Comments,Date
0,8,"ging watch to be sure, but a worthwhile one. J...",
1,10,tly layered examination of a man throughout al...,
2,10,delivers a powerfull biopic that shows the da...,
3,10,sider myself lucky to be alive to watch Christ...,
4,10,d the Oscar goes to... mark-217-30703319 July ...,
5,10,other Cinematic Masterpiece by Christopher Nol...,
6,8,", Christopher Nolan - probably the best blockb...",
7,9,al storytelling and Genius Cinametography Zay-...,
8,10,d EXTREMELY IMPORTANT MOVIE FOR THE PRESENT DA...,
9,8,-4913221 July 2023 Is it just me or did anyone...,


In [222]:
Oppenhim_File.preprocess_column(column_name='Comments')
Oppenhim_File.extract_dates(column_name='Comments')


In [223]:
Oppenhim_File.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84048 entries, 0 to 84047
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Qualification 0/10   84048 non-null  object
 1   Comments             83065 non-null  object
 2   Date                 0 non-null      object
dtypes: object(3)
memory usage: 1.9+ MB


In [224]:
null_count = Oppenhim_File.data['Comments'].isna().sum()
print(f'Number of null values in {"column_name"}: {null_count}')

Number of null values in column_name: 983


In [225]:
#Eliminacion de los valores Null
print(Oppenhim_File.info())
Oppenhim_File.data = Oppenhim_File.data.dropna(subset=['Comments'])
columns_names = Oppenhim_File.show_columns_names()
Oppenhim_File.data = Oppenhim_File.data.dropna(subset=[columns_names[0]])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84048 entries, 0 to 84047
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Qualification 0/10   84048 non-null  object
 1   Comments             83065 non-null  object
 2   Date                 0 non-null      object
dtypes: object(3)
memory usage: 1.9+ MB
None
There is the name of the columns
Qualification 0/10 
Comments
Date


In [226]:
print(Oppenhim_File.describe())
print(Oppenhim_File.show_rows(10))


       Qualification 0/10                                            Comments  \
count                83065                                              83065   
unique                  14                                               2006   
top                     10  ematic Masterpiece deenkhalil21 July 2023 Oppe...   
freq                 40159                                                131   

       Date  
count     0  
unique    0  
top     NaN  
freq    NaN  
  Qualification 0/10                                            Comments  Date
0                   8  ging watch to be sure, but a worthwhile one. J...  None
1                  10  tly layered examination of a man throughout al...  None
2                  10   delivers a powerfull biopic that shows the da...  None
3                  10  sider myself lucky to be alive to watch Christ...  None
4                  10  d the Oscar goes to... mark-217-30703319 July ...  None
5                  10  other Cinematic Masterpiece

In [227]:
unique_values = Oppenhim_File.data[columns_names[0]].unique()
print(unique_values)

['8' '10' '9' '7' '5' '6' '4'
 'Best movie of the year and one of the great war movies of all time.\nJohnDeSando21 July 2023\n"They won\'t fear it until they understand it. And they won\'t understand it until they\'ve used it. Theory will take you only so far." J. Robert Oppenheimer (Cillian Murphy)\n\nThe "it" is the atomic bomb, for which Oppenheimer was called "The Father." Writer'
 '3'
 "Totally overhyped!\nJonSnow_AT21 July 2023\nUnfortunately, the film is totally overhyped.\n\nI fell for it too.\n\nIn retrospect, I wouldn't watch this movie in theaters again and I won't watch it a second time.\n\nDon't get me wrong. The movie is not bad. It's a good movie. But unfortunately, not more.\n\nThe movie is supposed to be terrifying "
 '2' '1'
 "Amazing all around\nfilmfan88724 July 2023\nOppenheimer was overall fantastic! My brief, non-spoiler review:\n\nIt's about Robert Oppenheimer as a person and the journey he experienced in the development and usage of the atomic bomb. The film ex

In [233]:
col_names = Oppenhim_File.show_columns_names()
Oppenhim_File.filter_integer(col_names[0])


There is the name of the columns
Qualification_0/10
Comments
Date


In [235]:
unique_values = Oppenhim_File.data[col_names[0]].unique()
print(unique_values)

['8' '10' '9' '7' '5' '6' '4' '3' '2' '1']


In [228]:
Oppenhim_File.data.rename(columns={columns_names[0]: 'Qualification_0/10'}, inplace=True)

# Convirtiendo la columna 'Qualification_0/10' a tipo int
# Si hay valores no numéricos en la columna, es posible que debas manejarlos antes de hacer esta conversión
Oppenhim_File.data['Qualification_0/10'] = Oppenhim_File.data['Qualification_0/10'].astype(int)

# Asegurándote de que la columna 'Comments' es de tipo str
Oppenhim_File.data['Comments'] = Oppenhim_File.data['Comments'].astype(str)

ValueError: invalid literal for int() with base 10: 'Best movie of the year and one of the great war movies of all time.\nJohnDeSando21 July 2023\n"They won\'t fear it until they understand it. And they won\'t understand it until they\'ve used it. Theo