In [17]:
import pandas as pd
import numpy as np

# Creating a sample DataFrame
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [5, np.nan, np.nan, 8, 9],
    'C': ['foo', 'bar', 'foo', 'bar', 'foo'],
    'D': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']
}

df = pd.DataFrame(data)
df['D'] = pd.to_datetime(df['D'])

print("Original DataFrame:")
print(df)

# 1. Head and Tail
print("\nHead of the DataFrame:")
print(df.head())

print("\nTail of the DataFrame:")
print(df.tail())

# 2. Describe
print("\nStatistical Summary:")
print(df.describe())

# 3. Info
print("\nDataFrame Info:")
df.info()

# 4. Isnull and Notnull
print("\nCheck for null values:")
print(df.isnull())

print("\nCheck for non-null values:")
print(df.notnull())

# 5. Dropna
print("\nDrop rows with missing values:")
print(df.dropna())

# 6. Fillna
print("\nFill missing values with 0:")
print(df.fillna(0))

# 7. Groupby
print("\nGroup by column 'C':")
print(df.groupby('C').sum(numeric_only=True))

# 8. Apply (with lambda)
print("\nApply a lambda function to column 'A':")
print(df['A'].apply(lambda x: x*2))

# 9. Sort_values
print("\nSort DataFrame by column 'A':")
print(df.sort_values(by='A'))

# 10. Pivot_table
print("\nCreate a pivot table:")
pivot_df = pd.pivot_table(df, values='A', index=['C'], columns=['D'], aggfunc=np.sum)
print(pivot_df)

# 11. Merge
df2 = pd.DataFrame({
    'C': ['foo', 'bar', 'baz'],
    'E': [10, 20, 30]
})
print("\nMerge DataFrames:")
merged_df = pd.merge(df, df2, on='C', how='left')
print(merged_df)

# 12. Concat
print("\nConcatenate DataFrames:")
concat_df = pd.concat([df, df2], axis=0)
print(concat_df)

# 13. Dtypes
print("\nData types of each column:")
print(df.dtypes)

# 14. Unique
print("\nUnique values in column 'C':")
print(df['C'].unique())

# 15. Value_counts
print("\nValue counts for column 'C':")
print(df['C'].value_counts())

# 16. Rename
print("\nRename columns:")
df_renamed = df.rename(columns={'A': 'Alpha', 'B': 'Beta'})
print(df_renamed)

# 17. Set_index and Reset_index
print("\nSet column 'C' as index:")
df_indexed = df.set_index('C')
print(df_indexed)

print("\nReset index:")
print(df_indexed.reset_index())

# 18. Duplicate removal
print("\nCheck for duplicates:")
print(df.duplicated())

print("\nDrop duplicates:")
print(df.drop_duplicates())

# 19. Query
print("\nQuerying DataFrame where column A > 2:")
print(df.query('A > 2'))

# 20. Transform
print("\nTransform column 'A' with a custom function:")
print(df['A'].transform(lambda x: x + 10))

# 21. Melt
print("\nMelt DataFrame:")
melted_df = pd.melt(df, id_vars=['C'], value_vars=['A', 'B'])
print(melted_df)

# 22. Join
print("\nJoin DataFrames:")
joined_df = df.join(df2.set_index('C'), on='C', how='left')
print(joined_df)

# 23. Resample
print("\nResample by month (if dates were more varied):")
df_resampled = df.resample('M', on='D').sum()
print(df_resampled)

# 24. Rolling
print("\nRolling window calculation (sum over 2 periods for column 'A'):")
print(df['A'].rolling(window=2).sum())

# 25. Shift
print("\nShift column 'A' by 1 period:")
print(df['A'].shift(1))

# 26. Correlation
print("\nCorrelation between columns:")
print(df.corr(numeric_only=True))

# 27. Covariance
print("\nCovariance between columns:")
print(df.cov(numeric_only=True))

# 28. Cumulative sum
print("\nCumulative sum of column 'A':")
print(df['A'].cumsum())

# 29. Nlargest and Nsmallest
print("\n3 largest values in column 'A':")
print(df['A'].nlargest(3))

print("\n3 smallest values in column 'A':")
print(df['A'].nsmallest(3))

# 30. Explode
df_explode = pd.DataFrame({
    'A': [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
})
print("\nExplode a list-like column:")
print(df_explode.explode('A'))


Original DataFrame:
     A    B    C          D
0  1.0  5.0  foo 2023-01-01
1  2.0  NaN  bar 2023-01-02
2  NaN  NaN  foo 2023-01-03
3  4.0  8.0  bar 2023-01-04
4  5.0  9.0  foo 2023-01-05

Head of the DataFrame:
     A    B    C          D
0  1.0  5.0  foo 2023-01-01
1  2.0  NaN  bar 2023-01-02
2  NaN  NaN  foo 2023-01-03
3  4.0  8.0  bar 2023-01-04
4  5.0  9.0  foo 2023-01-05

Tail of the DataFrame:
     A    B    C          D
0  1.0  5.0  foo 2023-01-01
1  2.0  NaN  bar 2023-01-02
2  NaN  NaN  foo 2023-01-03
3  4.0  8.0  bar 2023-01-04
4  5.0  9.0  foo 2023-01-05

Statistical Summary:
              A         B                    D
count  4.000000  3.000000                    5
mean   3.000000  7.333333  2023-01-03 00:00:00
min    1.000000  5.000000  2023-01-01 00:00:00
25%    1.750000  6.500000  2023-01-02 00:00:00
50%    3.000000  8.000000  2023-01-03 00:00:00
75%    4.250000  8.500000  2023-01-04 00:00:00
max    5.000000  9.000000  2023-01-05 00:00:00
std    1.825742  2.081666     

  pivot_df = pd.pivot_table(df, values='A', index=['C'], columns=['D'], aggfunc=np.sum)


In [23]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


# Sample DataFrame for basic operations
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [5, np.nan, np.nan, 8, 9],
    'C': ['foo', 'bar', 'foo', 'bar', 'foo'],
    'D': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']
}

df = pd.DataFrame(data)
df['D'] = pd.to_datetime(df['D'])

# 1. DataFrame Creation and Input/Output
print("\nReading and Writing DataFrames:")
df.to_csv('output.csv')  # Write to CSV
df_from_csv = pd.read_csv('output.csv')  # Read from CSV
print("Read from CSV:")
print(df_from_csv.head())

# 2. Indexing and Selection
print("\nIndexing and Selection:")
print("Using loc:")
print(df.loc[0:2, ['A', 'C']])
print("Using iloc:")
print(df.iloc[0:2, [0, 2]])
print("Using at:")
print(df.at[1, 'A'])
print("Using iat:")
print(df.iat[1, 0])
print("Using xs:")
print(df.xs(0))
print("Sample:")
print(df.sample(3))

# 3. Advanced Data Manipulation
print("\nAdvanced Data Manipulation:")
df_reindexed = df.reindex([0, 2, 4])
print("Reindexing DataFrame:")
print(df_reindexed)
print("Stack:")
print(df.stack())
print("Unstack:")
print(df.stack().unstack())
print("Cutting data into bins:")
bins = pd.cut(df['A'].astype(float).to_numpy(), bins=3)
print(bins)
print("Get Dummies:")
dummies = pd.get_dummies(df['C'])
print(dummies)

# 4. Time Series and Date Functionality
print("\nTime Series and Date Functionality:")
print("Convert to datetime:")
df['D'] = pd.to_datetime(df['D'])
print(df['D'])
print("Resample time series data:")
df_resampled = df.resample('M', on='D').sum()
print(df_resampled)
print("Shift:")
print(df['A'].shift(1))
print("Diff:")
print(df['A'].diff())
print("Asfreq:")
print(df.set_index('D').asfreq('D'))

# 5. Aggregation and Grouping
print("\nAggregation and Grouping:")
print("Aggregate with custom function:")
agg_df = df.groupby('C').agg({'A': 'mean', 'B': 'sum'})
print(agg_df)
print("Crosstab:")
cross_tab = pd.crosstab(df['C'], df['B'])
print(cross_tab)
print("Pipe example:")
df_piped = df.pipe(lambda x: x.drop(columns=['B']))
print(df_piped)
print("Ranking values in column A:")
print(df['A'].rank())

# 6. String Operations
print("\nString Operations:")
df['C'] = df['C'].str.upper()
print("Contains 'FOO':")
print(df['C'].str.contains('FOO'))
print("Replace 'FOO' with 'BAZ':")
print(df['C'].str.replace('FOO', 'BAZ'))
print("Extract 'A' and 'B':")
df['Extracted'] = df['C'].str.extract(r'([A-Z]+)')
print(df['Extracted'])
print("Split 'C' column by 'A':")
df['Split'] = df['C'].str.split
print(df['Split'])

# 7. Window Functions
print("\nWindow Functions:")
print("Expanding sum:")
print(df['A'].expanding().sum())
print("Exponentially weighted mean:")
print(df['A'].ewm(span=2).mean())

# 8. Sparse Data Structures
print("\nSparse Data Structures:")
# sparse_df = pd.DataFrame(np.random.randn(10, 4)).to_sparse()  # Remove this line
sparse_df = pd.DataFrame(np.random.randn(10, 4))
sparse_df.iloc[0:8, :] = np.nan # Introduce sparsity by setting most values to NaN
print("Sparse DataFrame:")
print(sparse_df)

# 9. Reshaping and Pivoting
print("\nReshaping and Pivoting:")
print("Crosstab example:")
cross_tab = pd.crosstab(df['C'], df['B'])
print(cross_tab)
print("Pivoting DataFrame:")
df_pivoted = df.pivot(index='D', columns='C', values='A')
print(df_pivoted)

# 10. Miscellaneous Functions
print("\nMiscellaneous Functions:")
print("Clipping data:")
print(df['A'].clip(lower=2))
print("Evaluate expression on DataFrame:")
df['Eval'] = df.eval('A + B')
print(df['Eval'])
print("Where condition A > 3:")
print(df.where(df['A'] > 3))

# 11. Data Cleaning
print("\nData Cleaning:")
print("Replacing NaN in column A with 0:")
df['A'].replace(np.nan, 0, inplace=True)
print(df['A'])
print("Dropping column 'Eval':")
df.pop('Eval')
print(df)
print("Trimming values at input thresholds:")
print(df['A'].clip(lower=2, upper=4))



Reading and Writing DataFrames:
Read from CSV:
   Unnamed: 0    A    B    C           D
0           0  1.0  5.0  foo  2023-01-01
1           1  2.0  NaN  bar  2023-01-02
2           2  NaN  NaN  foo  2023-01-03
3           3  4.0  8.0  bar  2023-01-04
4           4  5.0  9.0  foo  2023-01-05

Indexing and Selection:
Using loc:
     A    C
0  1.0  foo
1  2.0  bar
2  NaN  foo
Using iloc:
     A    C
0  1.0  foo
1  2.0  bar
Using at:
2.0
Using iat:
2.0
Using xs:
A                    1.0
B                    5.0
C                    foo
D    2023-01-01 00:00:00
Name: 0, dtype: object
Sample:
     A    B    C          D
2  NaN  NaN  foo 2023-01-03
3  4.0  8.0  bar 2023-01-04
4  5.0  9.0  foo 2023-01-05

Advanced Data Manipulation:
Reindexing DataFrame:
     A    B    C          D
0  1.0  5.0  foo 2023-01-01
2  NaN  NaN  foo 2023-01-03
4  5.0  9.0  foo 2023-01-05
Stack:
0  A                    1.0
   B                    5.0
   C                    foo
   D    2023-01-01 00:00:00
1  A      