# import

In [73]:
from os import rename
from pathlib import Path

import pandas as pd

# Dataframes

## from dict

In [68]:
# Column-oriented input: every key becomes a column, every list its values.
data = dict(
    Country=['Belgium', 'India', 'Brazil'],
    Capital=['Brussels', 'New Delhi', 'Brasilia'],
    Population=[100, 200, 300],
)
# The dict keys, in insertion order, fix the column order explicitly.
df = pd.DataFrame(data, columns=list(data))

## from files

### CSV

In [None]:
# Minimal read: the file path is the only argument that has to be given
df = pd.read_csv(filepath_or_buffer="../data/products.csv")

"""
Most common parameters

| Parameter                   | Description                                                | Common Use Case                                             |
| --------------------------- | ---------------------------------------------------------- | ----------------------------------------------------------- |
| filepath_or_buffer          | File path (string or URL) or file-like object              | Required to point to the CSV file                           |
| sep                         | Field delimiter (default is `','`)                         | Use `\t` for TSV, `;` for European-style CSV                |
| header                      | Row number to use as column names (default is `0`)         | Use `None` if there's no header                             |
| names                       | List of column names to use (overrides `header`)           | Useful when header is missing or needs renaming             |
| index_col                   | Column(s) to set as index                                  | Often used to set time or ID columns as index               |
| usecols                     | Subset of columns to read (by name or index)               | Improves performance when not all columns are needed        |
| dtype                       | Dict of column types                                       | Enforces type consistency or memory optimization            |
| parse_dates                 | List of columns to parse as datetime                       | Automatically converts date strings into `datetime` objects |
| infer_datetime_format       | Speed up parsing (deprecated since pandas 2.0; no effect)  | Prefer `date_format` for consistent formats                 |
| na_values                   | List of strings to recognize as NaN                        | Customize missing value handling (`["NA", "NULL", "n/a"]`)  |
| skiprows                    | Number of lines to skip at the start                       | Useful for skipping metadata or junk rows                   |
| nrows                       | Number of rows to read                                     | Used for sampling large files                               |
| chunksize                   | Return an iterator with chunks of data (number of lines)   | Enables streaming large datasets in pieces                  |
| encoding                    | Encoding of the file (default is `'utf-8'`)                | Use `'ISO-8859-1'` or `'latin1'` for special characters     |
| compression                 | For compressed files (`'gzip'`, `'zip'`, etc.)             | Automatically handles zipped data files                     |

"""



In [None]:
"""
    This would read a CSV in chunks from S3,
    parse timestamps,
    optimize memory by enforcing types,
    and limit to specific useful columns.
"""
# With `chunksize` set, read_csv returns a TextFileReader (an iterator of
# DataFrames), not a DataFrame. Consume it inside a `with` block so the
# underlying handle is closed, and combine the chunks into one frame so that
# `df` really is a DataFrame afterwards.
with pd.read_csv(
    "s3://bucket/data.csv",
    sep=",",
    usecols=["id", "timestamp", "event_type"],
    parse_dates=["timestamp"],
    dtype={"id": "int32", "event_type": "category"},
    chunksize=100000,
) as chunk_reader:
    df = pd.concat(chunk_reader, ignore_index=True)

# Each chunk builds its own category set; when the sets differ, concat falls
# back to object dtype, so re-apply the categorical dtype on the result.
df["event_type"] = df["event_type"].astype("category")

In [None]:
"""
    Treating date fields
"""
# Load `date_added` as plain text first. Passing parse_dates/date_format to
# read_csv is fragile here: some values start with whitespace, so the strict
# format only "works" because parsing fails and the column is left as strings.
# If parsing ever succeeded, the .str.strip() below would raise on a datetime
# column.
df = pd.read_csv(f"{Path().absolute()}/netflix_titles.csv")

# Strip the stray leading whitespace, then parse with the explicit format.
# Missing values pass through strip() as NaN and become NaT.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), format='%B %d, %Y')

df.dtypes

## from database

In [None]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Database connection settings.
# NOTE(review): placeholders only -- load real credentials from environment
# variables or a secrets manager rather than committing them to the notebook.
host = "your_host"
port = "5432"
database = "your_database"
user = "your_username"
password = "your_password"

# Create the SQLAlchemy connection string.
# User name and password are URL-escaped so that characters such as "@", ":"
# or "/" in real credentials cannot corrupt the URL.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)

# Create the database engine
engine = create_engine(connection_string)

# Example query: rows created in the last 30 days
query = """
        SELECT id, name, created_at
        FROM your_table
        WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
        """

# Read the query result into a DataFrame
df = pd.read_sql(query, engine)

# Preview data (bare last expression -> rich notebook display)
df.head()


## basic info

In [41]:
# Quick-inspection cheat sheet: uncomment the line you want to run.
# df.shape   # (rows, columns) tuple
# df.index   # Describe index
# df.columns # Describe columns
# df.info()  # Info of df
df.count()  # Number of non-NA values

Unnamed: 0,Population
count,3.0
mean,200.0
std,100.0
min,100.0
25%,150.0
50%,200.0
75%,250.0
max,300.0


# Saving dataframes

## to files

In [None]:
# index=False leaves the row index out of the written file
df.to_csv(path_or_buf='data.csv', index=False)

"""
| Parameter             | Description                                           | Common Use Case                                         |
| --------------------- | ----------------------------------------------------- | ------------------------------------------------------- |
| path_or_buf           | File path or file-like object                         | Local path or cloud storage path                        |
| sep                   | Field delimiter (default is `','`)                    | Use `\t` for TSV or `;` in European formats             |
| index                 | Whether to write row indices (default `True`)         | Set to `False` to exclude index column                  |
| columns               | List of columns to write                              | Export only selected columns                            |
| header                | Whether to write column names (default `True`)        | Set to `False` to exclude header row                    |
| mode                  | Write mode (`'w'` or `'a'`)                           | `'a'` allows appending to existing files                |
| encoding              | Output encoding (default `'utf-8'`)                   | Use `'utf-8-sig'` for Excel compatibility or `'latin1'` |
| compression           | Compression format (`'gzip'`, `'zip'`, `'bz2'`, etc.) | Compress files during export                            |
| lineterminator        | End-of-line character (`'\n'`, `'\r\n'`)              | Control line endings across platforms                   |
| float_format          | Format for floating-point numbers                     | Useful for consistent rounding (e.g., `"%.2f"`)         |
| date_format           | Format for datetime columns                           | Standardize timestamp export (`"%Y-%m-%d"`)             |
| quoting               | CSV quoting behavior (from `csv` module)              | Handle special characters in text                       |
| chunksize             | Number of rows to write at a time                     | Useful for large data export                            |
| na_rep                | Representation for NaN/missing values                 | Replace NaN with specific text (e.g., `'-'`)            |

"""

# Transformations

## drop columns

In [13]:
# drop() returns a new frame; `df` itself keeps the 'Capital' column
df2 = df.drop(labels=['Capital'], axis='columns')
df2

Unnamed: 0,Country
0,Belgium
1,India
2,Brazil


## drop rows

In [5]:
# Drops the row whose index *label* is 0, which is the first row only while
# the frame still has its default integer index
df2 = df.drop(index=0)
df2

Unnamed: 0,Country,Capital
0,Belgium,Brussels
1,India,New Delhi
2,Brazil,Brasilia


## applying functions

In [61]:
def f(x):
    """Return ``x * 3`` (numbers are multiplied, strings are repeated)."""
    return x * 3


# With the default axis, apply() passes each column to f as a Series
df2 = df.apply(f)

In [72]:
def concat_country_capital(row):
    """Format one record as ``"<Country> (<Capital>)"``.

    Args:
        row: Any object indexable by the keys ``'Country'`` and ``'Capital'``
            (for example a DataFrame row passed in by ``apply(axis=1)``).

    Returns:
        The combined label as a string.
    """
    country = row['Country']
    capital = row['Capital']
    return "{} ({})".format(country, capital)


df2 = df.copy()

# axis='columns' (the same as axis=1) calls the function once per row,
# handing it that row as a pandas Series
df2['new_column'] = df2.apply(concat_country_capital, axis='columns')
df2

Unnamed: 0,Country,Capital,Population,new_column
0,Belgium,Brussels,100,Belgium (Brussels)
1,India,New Delhi,200,India (New Delhi)
2,Brazil,Brasilia,300,Brazil (Brasilia)


# Combining Data

## df merge

In [105]:

# Read both CSV files; sale dates are parsed into datetimes on load
products_df = pd.read_csv("../data/products.csv")
sales_df = pd.read_csv("../data/sales.csv", parse_dates=["sale_date"])

# Left join: every product is kept, including those without a matching sale
merged_sales_df = products_df.merge(sales_df, on='product_id', how='left')

merged_sales_df

Unnamed: 0,product_id,product_name,category,price,sale_id,quantity,sale_date
0,1,Laptop,Electronics,1200.0,101.0,1.0,2025-06-01
1,1,Laptop,Electronics,1200.0,103.0,1.0,2025-06-03
2,2,Smartphone,Electronics,800.0,102.0,2.0,2025-06-02
3,2,Smartphone,Electronics,800.0,106.0,1.0,2025-06-05
4,3,Headphones,Accessories,150.0,104.0,3.0,2025-06-03
5,4,Monitor,Electronics,300.0,,,NaT


## finding absent values

In [110]:
# Load the CSV files
products_df = pd.read_csv("../data/products.csv")
sales_df = pd.read_csv("../data/sales.csv")

# Ids of all products that appear in at least one sale
products_with_sales = sales_df["product_id"].unique()

# isin() marks products that have sales; the ~ operator inverts that boolean
# mask, so the selection behaves like "not in"
products_without_sales = products_df.loc[
    ~products_df["product_id"].isin(products_with_sales)
]

print("Products without sales:", products_without_sales, sep="\n")

Products without sales:
   product_id product_name     category  price
3           4      Monitor  Electronics  300.0


# Calculations

## Sort by index

In [16]:
# Order rows by their index labels, smallest first
df2 = df.sort_index(axis=0, ascending=True)

## Sort by values

In [18]:
# Order rows by the 'Capital' column, A to Z
df2 = df.sort_values('Capital', ascending=True)

## cumulative sum

In [None]:
# Work on a copy so the original frame is left untouched
df2 = df.copy()

# Running total of 'Population', accumulated in row order
df2['Cumulative_pop'] = df2.Population.cumsum()

## min, max, mean, median, describe stats

In [45]:
# df['Population'].min()
# df['Population'].max()
# df['Population'].mean() # = average
# df['Population'].median() # middle value in a sorted dataset
# df['Population'].describe()

np.float64(200.0)

## group by and sum

In [115]:
# Units sold per product, largest total first; reset_index() turns the
# product_id group key back into a regular column
sales_summary = (
    merged_sales_df
    .groupby('product_id')['quantity']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
sales_summary

Unnamed: 0,product_id,quantity
0,2,3.0
1,3,3.0
2,1,2.0
3,4,0.0


## finding the highest sale

In [109]:
# Revenue of each sale line, computed once with vectorized arithmetic.
# (A per-group lambda that looks prices up in the outer frame by index label
# is slower and misaligns if the index is not unique.)
sales_with_revenue = merged_sales_df.copy()
sales_with_revenue["revenue"] = (
    sales_with_revenue["quantity"] * sales_with_revenue["price"]
)

# Total units and total revenue per product. Rows without a sale have NaN
# quantity, which sum() skips, so such products total 0.
sales_summary = (
    sales_with_revenue
    .groupby(["product_id", "product_name", "category", "price"])
    .agg(total_quantity=("quantity", "sum"),
         total_revenue=("revenue", "sum"))
    .reset_index()
)

# Rank by total revenue (not by units sold) and keep the top product
most_sold_product = sales_summary.sort_values(by="total_revenue", ascending=False).head(1)

print("Most sold product with total revenue:")
print(most_sold_product)

Most sold product with total revenue:
   product_id product_name     category   price  total_quantity  total_revenue
0           1       Laptop  Electronics  1200.0             2.0         2400.0


In [176]:
# Reverse a string; the bare last expression is what the cell displays
nums = "Alisson Lima"

"".join(reversed(nums))

'amiL nossilA'

In [177]:
class ListNode:
    """Node of a singly linked list: a value plus a link to the next node."""

    def __init__(self, val=0, next=None):
        # `next` shadows the builtin only inside this initializer; the name is
        # kept because callers may pass it as a keyword argument.
        self.val, self.next = val, next

In [189]:
def reverse_linked_list(head: "ListNode | None") -> "ListNode | None":
    """Reverse a singly linked list in place and return its new head.

    Args:
        head: First node of the list, or ``None`` for an empty list.

    Returns:
        The node that was last in the original list (now the first), or
        ``None`` when ``head`` is ``None``.
    """
    reversed_head = None
    node = head
    while node:
        # Remember the successor before its link is overwritten.
        following = node.next
        # Point the current node back at the already-reversed part.
        node.next = reversed_head
        reversed_head = node
        node = following
    return reversed_head


In [190]:
# Helper functions to create and print linked list for testing
def create_linked_list(values):
    """Build a singly linked list of ``ListNode`` objects from ``values``.

    Args:
        values: Iterable of node values, in list order.

    Returns:
        The head node, or ``None`` when ``values`` is empty.
    """
    head = None
    tail = None
    for value in values:
        node = ListNode(value)
        if tail is None:
            # First value: this node is the head of the list.
            head = node
        else:
            tail.next = node
        tail = node
    return head

def print_linked_list(head):
    """Print the node values from ``head`` onward, joined by ``->``.

    An empty list (``head`` is ``None``) prints an empty line.
    """
    def _walk(node):
        # Yield each node in turn until the chain ends.
        while node:
            yield node
            node = node.next

    print("->".join(str(node.val) for node in _walk(head)))

# Example usage: build a five-node list, print it, reverse it, print it again
head = create_linked_list([1, 2, 3, 4, 5])
print("Original list:")
print_linked_list(head)
# The reversal relinks the existing nodes, so afterwards `head` (the node
# holding 1) is the last node of the list; use `reversed_head` from here on.
reversed_head = reverse_linked_list(head)
print("Reversed list:")
print_linked_list(reversed_head)

Original list:
1->2->3->4->5
Reversed list:
5->4->3->2->1
