In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('/content/non-financial-nse-2014-2025.csv')

No columns with all null values found.


In [4]:
# Check for columns with all null values (except the header)
null_columns = df.columns[df.isnull().all()]

if len(null_columns) > 0:
  print("Columns with all null values:")
  for col in null_columns:
    print(col)
else:
  print("No columns with all null values found.")

No columns with all null values found.


In [8]:
rows_with_no_nulls = df.dropna(how='any').shape[0]
print(f"Number of rows with zero null values: {rows_with_no_nulls}")

Number of rows with zero null values: 900


In [7]:
df.shape

(2343, 132)

In [9]:
null_distribution_per_row = df.isnull().sum(axis=1)
print("Distribution of null values per row:")
display(null_distribution_per_row.value_counts().sort_index())

Distribution of null values per row:


Unnamed: 0,count
0,900
1,12
2,13
3,6
4,5
...,...
126,14
127,29
128,17
129,24


In [11]:
def count_middle_nulls(row):
  """Counts the number of null values in a row surrounded by non-null values."""
  null_indices = row.index[row.isnull()]
  non_null_indices = row.index[row.notnull()]
  middle_null_count = 0

  if len(null_indices) == 0 or len(non_null_indices) < 2:
    return 0

  # Get the indices of the date columns (excluding 'Company Name')
  date_columns = df.columns[1:]

  for null_col in null_indices:
    try:
      # Find the position of the null column within the date columns
      null_pos = date_columns.get_loc(null_col)

      # Check for non-null values before and after the null value
      has_before = any(date_columns[i] in non_null_indices for i in range(null_pos))
      has_after = any(date_columns[i] in non_null_indices for i in range(null_pos + 1, len(date_columns)))

      if has_before and has_after:
        middle_null_count += 1
    except KeyError:
      # This handles the case where the null column is not a date column (like 'Company Name')
      continue

  return middle_null_count

# Apply the function to each row and get the count of middle nulls per row
middle_nulls_per_row = df.apply(count_middle_nulls, axis=1)

print("Distribution of middle null values per row:")
# Display the value counts for rows where the middle null count is greater than 0
display(middle_nulls_per_row[middle_nulls_per_row > 0].value_counts().sort_index())

Distribution of middle null values per row:


Unnamed: 0,count
1,33
2,24
3,13
4,11
5,14
6,14
7,5
8,9
9,6
10,4


In [12]:
# Identify rows with less than 5 middle null values
rows_to_interpolate = df[middle_nulls_per_row < 5].index

# Select only the date columns for interpolation
date_columns = df.columns[1:]

# Interpolate the null values in the identified rows for date columns
df.loc[rows_to_interpolate, date_columns] = df.loc[rows_to_interpolate, date_columns].interpolate(method='linear', axis=1)

print(f"Interpolated null values in {len(rows_to_interpolate)} rows that had less than 5 middle nulls.")

Interpolated null values in 2219 rows that had less than 5 middle nulls.


In [13]:
rows_with_no_nulls = df.dropna(how='any').shape[0]
print(f"Number of rows with zero null values: {rows_with_no_nulls}")

Number of rows with zero null values: 982


In [14]:
df_cleaned = df.dropna(how='any')

print(f"Original number of rows: {df.shape[0]}")
print(f"Number of rows after removing rows with nulls: {df_cleaned.shape[0]}")

# You can now work with df_cleaned which contains no null values

Original number of rows: 2343
Number of rows after removing rows with nulls: 982


In [15]:
df_cleaned.shape

(982, 132)

In [16]:
df_cleaned.to_csv('Stock_return-cleaned.csv', index=False)