In [42]:
import pandas as pd
import json

def json_to_df(json_file_path):
    try:
        with open(json_file_path, 'r') as j:
            data = json.loads(j.read())
    except FileNotFoundError:
        print(f"Error: File not found at {json_file_path}")
        return None

    if isinstance(data, list) and all(isinstance(item, dict) for item in data):
      df = pd.DataFrame(data)
      return df
    elif isinstance(data, dict):
      df = pd.DataFrame([data])
      return df
    else:
      print("Error: Invalid JSON format for DataFrame conversion.")
      return None

# Example usage
file_path = '/content/annotations (3).json'
df = json_to_df(file_path)

df

Unnamed: 0,_id,reel_id,isSafe,codemixed,unsafeType
0,677382209bf7c70d197440b4,1468676857377451,1,[English],
1,677382219bf7c70d197440b5,1468676857377451,1,[English],
2,6773822a9bf7c70d197440b6,1078528213980856,1,[Bengali],
3,677382379bf7c70d197440b7,1738001793692904,0,[Bengali],Adult
4,677382409bf7c70d197440b8,1043909007248349,0,[Others],Adult
...,...,...,...,...,...
795,677bd3d80231fcf216f094d2,807246464659259,1,[Bengali],
796,677bd4670231fcf216f094d3,1202448027685866,1,[Bengali],
797,677bd5420231fcf216f094d4,560530613387498,0,[Bengali],Adult
798,677bd6380231fcf216f094d5,854074033379275,0,[Bengali],Adult


In [43]:
df['isSafe'].value_counts()

Unnamed: 0_level_0,count
isSafe,Unnamed: 1_level_1
1,509
0,291


In [44]:
df['reel_id'].value_counts()

Unnamed: 0_level_0,count
reel_id,Unnamed: 1_level_1
900553318861344,5
1154843582259253,3
922983583085691,3
1165503301150914,3
1250788619281433,3
...,...
1866865830468541,1
6129632457104384,1
889338516741869,1
1038868304211575,1


In [45]:
df[df['reel_id']=='900553318861344']

Unnamed: 0,_id,reel_id,isSafe,codemixed,unsafeType
726,677aab10b90f8b817515739a,900553318861344,1,"[Bengali, English]",
727,677aab19b90f8b817515739b,900553318861344,1,"[Bengali, English]",
728,677aab1b4c3bf9d0702eef4d,900553318861344,1,"[Bengali, English]",
729,677aab1ce6e4d480d48a384d,900553318861344,1,"[Bengali, English]",
730,677aab1d4c3bf9d0702eef4e,900553318861344,1,"[Bengali, English]",


In [46]:
reel_id_counts = df['reel_id'].value_counts()
num_reel_ids_gt1 = sum(1 for count in reel_id_counts if count > 1)
num_reel_ids_gt1

46

In [47]:
df['reel_id'].nunique()

745

In [48]:
df['codemixed'].value_counts()

Unnamed: 0_level_0,count
codemixed,Unnamed: 1_level_1
[Bengali],478
"[Bengali, English]",93
[Hindi],31
[Others],29
[English],23
"[Bengali, Hindi]",12
"[English, Bengali]",8
"[Hindi, English]",3
"[Others, Bengali]",2
"[Hindi, Bengali]",2


In [49]:
df['unsafeType'] = df['unsafeType'].fillna("Safe")

In [50]:
df['unsafeType'].value_counts()

Unnamed: 0_level_0,count
unsafeType,Unnamed: 1_level_1
Safe,509
Adult,216
Harmful,71
Suicidal,4


In [51]:
new_df = df.groupby('reel_id').agg(
        Count=('reel_id', 'count'),
        isSafe=('isSafe', list),
        Codemixed=('codemixed', list),
        UnsafeType=('unsafeType', list)
    ).reset_index()

new_df

Unnamed: 0,reel_id,Count,isSafe,Codemixed,UnsafeType
0,1000188341415358,1,[1],[[English]],[Safe]
1,1006146964533997,1,[1],[[Bengali]],[Safe]
2,1007123741459823,1,[0],[None],[Adult]
3,1007420607202244,1,[1],[[Bengali]],[Safe]
4,1008519870870927,1,[1],[[Bengali]],[Safe]
...,...,...,...,...,...
740,997262608752854,1,[0],[[Bengali]],[Adult]
741,997300345735398,1,[0],"[[Bengali, English]]",[Harmful]
742,998187401354206,1,[1],[[Bengali]],[Safe]
743,999558344609179,1,[1],[[Bengali]],[Safe]


In [52]:
new_df[new_df['Count']>2]

Unnamed: 0,reel_id,Count,isSafe,Codemixed,UnsafeType
104,1154843582259253,3,"[1, 1, 1]","[[Bengali], [Bengali], [Bengali]]","[Safe, Safe, Safe]"
110,1165503301150914,3,"[1, 1, 1]","[[Bengali], [Bengali], [Bengali]]","[Safe, Safe, Safe]"
155,1250788619281433,3,"[0, 0, 0]","[[Bengali, English], [Bengali, English], [Beng...","[Harmful, Harmful, Harmful]"
512,545597244823178,3,"[0, 0, 0]","[[Bengali, English], [Bengali, English], [Beng...","[Adult, Adult, Adult]"
558,595783629789932,3,"[1, 1, 1]","[[Bengali], [Bengali], [Bengali]]","[Safe, Safe, Safe]"
678,900553318861344,5,"[1, 1, 1, 1, 1]","[[Bengali, English], [Bengali, English], [Beng...","[Safe, Safe, Safe, Safe, Safe]"
695,922983583085691,3,"[1, 1, 1]","[[Bengali], [Bengali], [Bengali]]","[Safe, Safe, Safe]"


In [53]:
li = new_df['Codemixed'].iloc[104]
li

[['Bengali'], ['Bengali'], ['Bengali']]

In [54]:
for i in li:
  print(i)

['Bengali']
['Bengali']
['Bengali']


In [55]:
# prompt: In the column codemixed, there are 2D lists containing strings like [['A', 'B'], [C]]
# You have to check if "Hindi" or "Other" contains in any of the values in that list. find
# how many column contain this.

def check_hindi_other(codemixed_col):
    count = 0
    for sublist in codemixed_col:
        if sublist is not None:
          for item in sublist:
              if "Hindi" in item or "Other" in item:
                  count += 1
                  break  # Exit inner loop once "Hindi" or "Other" is found in a sublist
    return count

# Apply the function to the 'Codemixed' column and create a new column 'Hindi_Other_Count'
new_df['Hindi_Other_Count'] = new_df['Codemixed'].apply(check_hindi_other)

# Calculate the number of rows where 'Hindi_Other_Count' is greater than 0
num_rows_with_hindi_other = len(new_df[new_df['Hindi_Other_Count'] > 0])

num_rows_with_hindi_other

79

In [56]:
ndf = new_df[new_df['Hindi_Other_Count'] == 0]
ndf

Unnamed: 0,reel_id,Count,isSafe,Codemixed,UnsafeType,Hindi_Other_Count
0,1000188341415358,1,[1],[[English]],[Safe],0
1,1006146964533997,1,[1],[[Bengali]],[Safe],0
2,1007123741459823,1,[0],[None],[Adult],0
3,1007420607202244,1,[1],[[Bengali]],[Safe],0
4,1008519870870927,1,[1],[[Bengali]],[Safe],0
...,...,...,...,...,...,...
740,997262608752854,1,[0],[[Bengali]],[Adult],0
741,997300345735398,1,[0],"[[Bengali, English]]",[Harmful],0
742,998187401354206,1,[1],[[Bengali]],[Safe],0
743,999558344609179,1,[1],[[Bengali]],[Safe],0


In [57]:
ndf.drop(columns=['Hindi_Other_Count', 'Codemixed'], inplace=True)
ndf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf.drop(columns=['Hindi_Other_Count', 'Codemixed'], inplace=True)


Unnamed: 0,reel_id,Count,isSafe,UnsafeType
0,1000188341415358,1,[1],[Safe]
1,1006146964533997,1,[1],[Safe]
2,1007123741459823,1,[0],[Adult]
3,1007420607202244,1,[1],[Safe]
4,1008519870870927,1,[1],[Safe]
...,...,...,...,...
740,997262608752854,1,[0],[Adult]
741,997300345735398,1,[0],[Harmful]
742,998187401354206,1,[1],[Safe]
743,999558344609179,1,[1],[Safe]


In [59]:
# prompt: In the dataset, isSafe, Codemixed and UnsafeType column values are list containing several values. Based on voting approach, choose and replace with a single value (not list) which came highest number of time. If a tie occur in isSafe, look for UnsafeType.. if UnsafeType is safe.. issafe will be 0. otherwise pick randomly to resolve tie.
# def most_frequent(List):
#     return max(set(List), key = List.count)
# choose the value which is more frequent in the list.
# your most frequent function is not working correctly

import pandas as pd
import json
from collections import Counter

def most_frequent(List):
    if not List:
        return None  # Handle empty lists
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]

def resolve_tie(isSafe_list, unsafeType_list):
    if not isSafe_list:
        return None

    is_safe_counts = Counter(isSafe_list)
    if len(is_safe_counts) == 1:
        return isSafe_list[0]

    most_common_isSafe = is_safe_counts.most_common()
    if most_common_isSafe[0][1] == most_common_isSafe[1][1]:
      # Tie in isSafe, consider unsafeType
      unsafe_counts = Counter(unsafeType_list)
      if "Safe" in unsafe_counts and unsafe_counts["Safe"] > 0:
          return 0  # isSafe is 0 if UnsafeType has "Safe" values
      else:
          # Randomly pick from the tied values
          import random
          return random.choice([val for val, count in most_common_isSafe])
    else:
      return most_common_isSafe[0][0]

ndf['isSafe_final'] = ndf['isSafe'].apply(most_frequent)
ndf['UnsafeType_final'] = ndf['UnsafeType'].apply(most_frequent)


# Apply the tie-breaking logic
ndf['isSafe_resolved'] = ndf.apply(lambda row: resolve_tie(row['isSafe'], row['UnsafeType']), axis=1)

ndf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf['isSafe_final'] = ndf['isSafe'].apply(most_frequent)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf['UnsafeType_final'] = ndf['UnsafeType'].apply(most_frequent)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf['isSafe_resolved'] = ndf.apply(lambda row: resolve_tie(row['isSafe'], row['Unsa

Unnamed: 0,reel_id,Count,isSafe,UnsafeType,isSafe_final,UnsafeType_final,isSafe_resolved
0,1000188341415358,1,[1],[Safe],1,Safe,1
1,1006146964533997,1,[1],[Safe],1,Safe,1
2,1007123741459823,1,[0],[Adult],0,Adult,0
3,1007420607202244,1,[1],[Safe],1,Safe,1
4,1008519870870927,1,[1],[Safe],1,Safe,1
...,...,...,...,...,...,...,...
740,997262608752854,1,[0],[Adult],0,Adult,0
741,997300345735398,1,[0],[Harmful],0,Harmful,0
742,998187401354206,1,[1],[Safe],1,Safe,1
743,999558344609179,1,[1],[Safe],1,Safe,1


In [62]:
ndf.drop(columns=['isSafe', 'UnsafeType', 'Count', 'isSafe_final'], inplace=True)
ndf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf.drop(columns=['isSafe', 'UnsafeType', 'Count', 'isSafe_final'], inplace=True)


Unnamed: 0,reel_id,UnsafeType_final,isSafe_resolved
0,1000188341415358,Safe,1
1,1006146964533997,Safe,1
2,1007123741459823,Adult,0
3,1007420607202244,Safe,1
4,1008519870870927,Safe,1
...,...,...,...
740,997262608752854,Adult,0
741,997300345735398,Harmful,0
742,998187401354206,Safe,1
743,999558344609179,Safe,1


In [64]:
# prompt: rename column name

ndf.rename(columns={'UnsafeType_final': 'UnsafeType',
                    'isSafe_resolved' : 'isSafe'}, inplace=True)
ndf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf.rename(columns={'UnsafeType_final': 'UnsafeType',


Unnamed: 0,reel_id,UnsafeType,isSafe
0,1000188341415358,Safe,1
1,1006146964533997,Safe,1
2,1007123741459823,Adult,0
3,1007420607202244,Safe,1
4,1008519870870927,Safe,1
...,...,...,...
740,997262608752854,Adult,0
741,997300345735398,Harmful,0
742,998187401354206,Safe,1
743,999558344609179,Safe,1


In [66]:
ndf['UnsafeType'].value_counts()

Unnamed: 0_level_0,count
UnsafeType,Unnamed: 1_level_1
Safe,422
Adult,181
Harmful,61
Suicidal,2


In [65]:
ndf.to_csv('final_data.csv', index=False)