In [29]:
import numpy as np
import pandas as pd

#import text_exploration as texplo

In [2]:
# A few dataframes for testing:

data_multitype = {
    'Column1': [1, 2.5, 3, 4.8, 5],     # Mixed integers and floats
    'Column2': ['A', 'B', 'C', 'D', 5], # Strings and an integer
    'Column3': [True, False, np.nan, 1, 'Mixed']  # Booleans, NaN, integer, and string
}

df_multitype = pd.DataFrame(data_multitype)

data_nans = {
    'Column1': [1, np.nan, np.nan, 4, 5],       # Contains NaN values
    'Column2': ['A', 'B', np.nan, np.nan, 'E'], # Contains NaN values
    'Column3': [True, np.nan, np.nan, True, False]  # Contains NaN values
}

df_nans = pd.DataFrame(data_nans)

df_maps1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 3],
        'b': [1, 3, 1, 3, 1],
        'c': ['lollo', 'gigio', 'lollo', 'gigio', 'lollo']
    })

df_maps2 = pd.DataFrame(
    {
        'a': [1, 2, 3, 4],
        'b': [1, 1, 3, 5]
    }
)

data_text = {
    'Column1': ['##apple##', 'banana!!', '%%cherry%%', 'date@@', '@@elderberry@@'],  # Repeated special characters
    'Column2': [' A1 ', ' B2 ', ' C3 ', ' D4 ', ' E5 '],                             # Leading/trailing spaces
    'Column3': ['@hello@', '!!world!!', ' python ', 'rocks@@', None]                # Mixed issues
}

df_text = pd.DataFrame(data_text)

# 1. Data Types

In [11]:
import data_types as types

# Print out the names of all the functions in the data_types module
print([item for item in dir(types) if not item.startswith('__')])

['col_types', 'nan2nan', 'pd']


### 1.1 Explore types with `col_types()`
`types.col_types(df)` prints out the number of element types found in each column of a given pandas Data Frame:

In [12]:
help(types.col_types)

Help on function col_types in module data_types:

col_types(df)
    '
    In: a data frame df
    Out: prints the number of element types found in each column of a given df



In [42]:
print(df_multitype,'\n')

types.col_types(df_multitype)

   Column1 Column2 Column3
0      1.0       A    True
1      2.5       B   False
2      3.0       C     NaN
3      4.8       D       1
4      5.0       5   Mixed 

Column1
<class 'float'>    5
Name: count, dtype: int64 
 NaN values:  0 

Column2
<class 'str'>    4
<class 'int'>    1
Name: count, dtype: int64 
 NaN values:  0 

Column3
<class 'bool'>     2
<class 'float'>    1
<class 'int'>      1
<class 'str'>      1
Name: count, dtype: int64 
 NaN values:  1 



### 1.2 Explore Matching NaNs with `nan2nan`
`types.nan2nan` serves to answer the question: Are NaN values in col_1 also NaN in col 2?

In [13]:
help(types.nan2nan)

Help on function nan2nan in module data_types:

nan2nan(df, col_1_name, col_2_name)
    Given 2 columns in a data frame df, this function
    returns a list with the number of NaNs in the first,
    The number of NaNs in the second,
    And the number fo rows where both cols have NaN value



In [41]:
print(df_nans,'\n')

types.nan2nan(df_nans, 'Column1', 'Column2')

   Column1 Column2 Column3
0      1.0       A    True
1      NaN       B     NaN
2      NaN     NaN     NaN
3      4.0     NaN    True
4      5.0       E   False 

NaN values in Column1:  2 

NaN values in Column2:  2 

NaN values in the same row for both Column1 and Column2:  1


# 2. Mappings

In [26]:
import df_mappings as map

# Print out the names of all the functions in the df_mappings module
print([item for item in dir(map) if not item.startswith('__')])

['foreign_k2k', 'map_col2col', 'occurrences', 'pd', 'xs2xs', 'xs2ys']


### 2.1 Which values are in the same position?
`maps.map_col2col`: given 2 columns in a data frame (or any two same length columns), one might be interested in how they map onto each other. Is it a 1 to 1 mapping? Does it define a function? And if so, is the function injective of surjective?

The following function, given col_1 and col_2, returns, for each value in col_1, how many values it is associated to in col_2:

In [18]:
help(map.map_col2col)

Help on function map_col2col in module df_mappings:

map_col2col(series_1, series_2)



In [None]:
# Try it out here:
print(df_maps1, '\n')
print(map_col2col(df_maps1.b, df_maps1.c))

   a  b      c
0  1  1  lollo
1  2  3  gigio
2  3  1  lollo
3  4  3  gigio
4  3  1  lollo 

   gigio  lollo
1  False   True
3   True  False


By calling `map_count(col_1, col_2)` and then `map_count(col_2, col_1)`, one will be able to determine the relation between the two columns.

### 2.2 Can 2 same type series get a 1 to 1 mapping?
`map.foreign_k2k`: are you wandering whether two columns from different tables have the same unique values?

In [19]:
help(map.foreign_k2k)

Help on function foreign_k2k in module df_mappings:

foreign_k2k(series_1, series_2, df1_name='table1', df2_name='table2', n_matches=-1, mode='>')



In [9]:

map.foreign_k2k(df_maps2.a, df_maps2.b)

Unnamed: 0,table1.a_values,matches_in_table2.b > -1
0,1,2
1,2,0
2,3,1
3,4,0


### 2.3 xs2xs

In [32]:
iter_1 = [1, 2, 3, 4, 5, 6]
iter_2 = [1, 5, 3, 3, 4, 5, 8]

In [33]:
pd.concat([iter_1, iter_2])

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [34]:
print(map.xs2xs(iter_1, iter_2))

NameError: name 'concat' is not defined

# 3. Explore your text fields

In [20]:
import text_exploration as texplo

# Print out the names of all the functions in the text_exploration module
print([item for item in dir(texplo) if not item.startswith('__')])

['finals', 'initials', 'pd']


### 3.1 Need trimming?
A long column of strings sometimes starts with undesirable characters like brackets or others. Let's quickly check what are the initials of our strings:

In [35]:
help(texplo.initials)

Help on function initials in module text_exploration:

initials(s)



In [34]:
help(texplo.finals)

Help on function finals in module text_exploration:

finals(s)



In [33]:
print(df_text, '\n')

print(texplo.finals(df_text.Column3))

          Column1 Column2    Column3
0       ##apple##     A1     @hello@
1        banana!!     B2   !!world!!
2      %%cherry%%     C3     python 
3          date@@     D4     rocks@@
4  @@elderberry@@     E5        None 

Column3
@       2
!       1
        1
None    1
Name: count, dtype: int64
