# Quran Frequency Comparison
This notebook merges two Quran CSV files and compares their frequency columns.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np

## Step 1: Load the CSV files

In [2]:
# Read the ayah-level CSV; decoding as UTF-8 keeps the Arabic text intact
ayahs_df = pd.read_csv(
    '/kaggle/input/quran-csv-builder-updated-v4/quran_ayahs(updated_4).csv',
    encoding='utf-8',
)
print("Quran Ayahs shape:", ayahs_df.shape)
print("\nFirst 5 rows of Quran Ayahs:")
display(ayahs_df.head(5))

Quran Ayahs shape: (6236, 9)

First 5 rows of Quran Ayahs:


Unnamed: 0,serial_no,surah_no,ayah_no,ayah,frequency_proper_noun,label,length,tokens,word_count
0,1,1,1,بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,1,,38,بِسۡمِ | ٱللَّهِ | ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,4
1,2,1,2,ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ,1,,37,ٱلۡحَمۡدُ | لِلَّهِ | رَبِّ | ٱلۡعَٰلَمِينَ,4
2,3,1,3,ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,0,,23,ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,2
3,4,1,4,مَٰلِكِ يَوۡمِ ٱلدِّينِ,0,,23,مَٰلِكِ | يَوۡمِ | ٱلدِّينِ,3
4,5,1,5,إِيَّاكَ نَعۡبُدُ وَإِيَّاكَ نَسۡتَعِينُ,0,,40,إِيَّاكَ | نَعۡبُدُ | وَإِيَّاكَ | نَسۡتَعِينُ,4


In [3]:
# Read the per-verse frequency CSV (also decoded as UTF-8)
reordered_df = pd.read_csv(
    '/kaggle/input/datasets/axha241419/corpus-quran-comfrequencies-added-to-original/quran_allah_reordered (1).csv',
    encoding='utf-8',
)
print("Quran Allah Reordered shape:", reordered_df.shape)
print("\nFirst 5 rows of Quran Allah Reordered:")
display(reordered_df.head(5))

Quran Allah Reordered shape: (6236, 4)

First 5 rows of Quran Allah Reordered:


Unnamed: 0,surah_number,verse_number,frequency,word_locations
0,1,1,1,[2]
1,1,2,1,[2]
2,1,3,0,[]
3,1,4,0,[]
4,1,5,0,[]


## Step 2: Merge the dataframes

In [4]:
# Rename the reordered file's columns so they stay distinguishable from
# `frequency_proper_noun` once both sources sit in the same frame.
reordered_df_renamed = reordered_df.rename(columns={
    'frequency': 'frequency_reordered',
    'word_locations': 'word_locations_reordered'
})

# Left-join on (surah, ayah): every row of ayahs_df is kept, and rows without
# a counterpart get NaN in the *_reordered columns.
# validate='many_to_one' makes pandas raise MergeError if a (surah_number,
# verse_number) pair occurs more than once in the right-hand frame. Without
# it, such duplicates would silently multiply rows and distort the error
# counts computed from merged_df later on.
merged_df = ayahs_df.merge(
    reordered_df_renamed[['surah_number', 'verse_number', 'frequency_reordered', 'word_locations_reordered']],
    left_on=['surah_no', 'ayah_no'],
    right_on=['surah_number', 'verse_number'],
    how='left',
    validate='many_to_one'
)

print("Merged dataframe shape:", merged_df.shape)
print("\nFirst 10 rows of merged data:")
display(merged_df.head(10))

Merged dataframe shape: (6236, 13)

First 10 rows of merged data:


Unnamed: 0,serial_no,surah_no,ayah_no,ayah,frequency_proper_noun,label,length,tokens,word_count,surah_number,verse_number,frequency_reordered,word_locations_reordered
0,1,1,1,بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,1,,38,بِسۡمِ | ٱللَّهِ | ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,4,1,1,1,[2]
1,2,1,2,ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ,1,,37,ٱلۡحَمۡدُ | لِلَّهِ | رَبِّ | ٱلۡعَٰلَمِينَ,4,1,2,1,[2]
2,3,1,3,ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,0,,23,ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,2,1,3,0,[]
3,4,1,4,مَٰلِكِ يَوۡمِ ٱلدِّينِ,0,,23,مَٰلِكِ | يَوۡمِ | ٱلدِّينِ,3,1,4,0,[]
4,5,1,5,إِيَّاكَ نَعۡبُدُ وَإِيَّاكَ نَسۡتَعِينُ,0,,40,إِيَّاكَ | نَعۡبُدُ | وَإِيَّاكَ | نَسۡتَعِينُ,4,1,5,0,[]
5,6,1,6,ٱهۡدِنَا ٱلصِّرَٰطَ ٱلۡمُسۡتَقِيمَ,0,,34,ٱهۡدِنَا | ٱلصِّرَٰطَ | ٱلۡمُسۡتَقِيمَ,3,1,6,0,[]
6,7,1,7,صِرَٰطَ ٱلَّذِينَ أَنۡعَمۡتَ عَلَيۡهِمۡ غَيۡرِ...,0,,90,صِرَٰطَ | ٱلَّذِينَ | أَنۡعَمۡتَ | عَلَيۡهِمۡ ...,9,1,7,0,[]
7,8,2,1,الٓمٓ,0,,5,الٓمٓ,1,2,1,0,[]
8,9,2,2,ذَٰلِكَ ٱلۡكِتَٰبُ لَا رَيۡبَۛ فِيهِۛ هُدٗى لّ...,0,,59,ذَٰلِكَ | ٱلۡكِتَٰبُ | لَا | رَيۡبَۛ | فِيهِۛ ...,7,2,2,0,[]
9,10,2,3,ٱلَّذِينَ يُؤۡمِنُونَ بِٱلۡغَيۡبِ وَيُقِيمُونَ...,0,,92,ٱلَّذِينَ | يُؤۡمِنُونَ | بِٱلۡغَيۡبِ | وَيُقِ...,8,2,3,0,[]


## Step 3: Create comparison column

In [5]:
# Create a comparison column that shows 'error' where frequencies don't match
# Otherwise leave it empty
def compare_frequencies(row):
    """Label one merged row according to whether its two frequencies agree.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'frequency_proper_noun' and 'frequency_reordered'.

    Returns:
        'error - no match found' if 'frequency_reordered' is missing,
        'error' if the two frequencies differ,
        '' (empty string) if they are equal.
    """
    reordered_value = row['frequency_reordered']

    # A missing value is reported separately from a genuine disagreement
    if pd.isna(reordered_value):
        return 'error - no match found'

    return 'error' if row['frequency_proper_noun'] != reordered_value else ''

# Label every row with compare_frequencies (applied row by row)
merged_df['frequency_comparison'] = merged_df.apply(compare_frequencies, axis='columns')

print("Comparison column created!")
print("\nFirst 10 rows with comparison:")
# Preview only the identifying columns and the three frequency-related ones
display(
    merged_df.loc[:, [
        'serial_no',
        'surah_no',
        'ayah_no',
        'frequency_proper_noun',
        'frequency_reordered',
        'frequency_comparison',
    ]].head(10)
)

Comparison column created!

First 10 rows with comparison:


Unnamed: 0,serial_no,surah_no,ayah_no,frequency_proper_noun,frequency_reordered,frequency_comparison
0,1,1,1,1,1,
1,2,1,2,1,1,
2,3,1,3,0,0,
3,4,1,4,0,0,
4,5,1,5,0,0,
5,6,1,6,0,0,
6,7,1,7,0,0,
7,8,2,1,0,0,
8,9,2,2,0,0,
9,10,2,3,0,0,


## Step 4: Check for errors

In [6]:
# Any non-empty label counts as an error (mismatch or missing match)
error_count = merged_df['frequency_comparison'].ne('').sum()
total_rows = merged_df.shape[0]

print(f"Total rows: {total_rows}")
print(f"Rows with errors: {error_count}")
print(f"Rows matching: {total_rows - error_count}")
print(f"\nPercentage of errors: {(error_count/total_rows)*100:.2f}%")

Total rows: 6236
Rows with errors: 7
Rows matching: 6229

Percentage of errors: 0.11%


In [7]:
# List the flagged verses, or confirm that none were flagged
if error_count > 0:
    print("\nRows with frequency mismatches:")
    # Select the flagged rows and the columns needed to inspect them in one step
    errors_df = merged_df.loc[
        merged_df['frequency_comparison'].ne(''),
        [
            'serial_no', 'surah_no', 'ayah_no', 'ayah',
            'frequency_proper_noun', 'frequency_reordered', 'frequency_comparison',
        ],
    ]
    display(errors_df)
else:
    print("\nNo frequency mismatches found! All frequencies match perfectly.")


Rows with frequency mismatches:


Unnamed: 0,serial_no,surah_no,ayah_no,ayah,frequency_proper_noun,frequency_reordered,frequency_comparison
318,319,3,26,قُلِ ٱللَّهُمَّ مَٰلِكَ ٱلۡمُلۡكِ تُؤۡتِي ٱلۡم...,1,0,error
782,783,5,114,قَالَ عِيسَى ٱبۡنُ مَرۡيَمَ ٱللَّهُمَّ رَبَّنَ...,1,0,error
1191,1192,8,32,وَإِذۡ قَالُواْ ٱللَّهُمَّ إِن كَانَ هَٰذَا هُ...,1,0,error
1373,1374,10,10,دَعۡوَىٰهُمۡ فِيهَا سُبۡحَٰنَكَ ٱللَّهُمَّ وَت...,2,1,error
1422,1423,10,59,قُلۡ أَرَءَيۡتُم مَّآ أَنزَلَ ٱللَّهُ لَكُم م...,2,3,error
3217,3218,27,59,قُلِ ٱلۡحَمۡدُ لِلَّهِ وَسَلَٰمٌ عَلَىٰ عِبَاد...,1,2,error
4103,4104,39,46,قُلِ ٱللَّهُمَّ فَاطِرَ ٱلسَّمَٰوَٰتِ وَٱلۡأَر...,1,0,error


## Step 5: Reorder columns and prepare final output

In [8]:
# The right-hand join keys duplicate surah_no / ayah_no, so remove them
final_df = merged_df.drop(['surah_number', 'verse_number'], axis=1)

# Locate the original frequency column; the two new columns go right after it
cols = list(final_df.columns)
freq_idx = cols.index('frequency_proper_noun')

# Everything up to and including frequency_proper_noun, then the two new
# columns, then the remaining columns in their existing order
new_order = [
    *cols[:freq_idx+1],
    'frequency_reordered',
    'frequency_comparison',
    *(
        name for name in cols[freq_idx+1:]
        if name not in ('frequency_reordered', 'frequency_comparison')
    ),
]

final_df = final_df.loc[:, new_order]

print("Final dataframe prepared!")
print("\nColumn order:")
for i, col in enumerate(final_df.columns, start=1):
    print(f"{i}. {col}")

print("\nFirst 10 rows of final data:")
display(final_df.head(10))

Final dataframe prepared!

Column order:
1. serial_no
2. surah_no
3. ayah_no
4. ayah
5. frequency_proper_noun
6. frequency_reordered
7. frequency_comparison
8. label
9. length
10. tokens
11. word_count
12. word_locations_reordered

First 10 rows of final data:


Unnamed: 0,serial_no,surah_no,ayah_no,ayah,frequency_proper_noun,frequency_reordered,frequency_comparison,label,length,tokens,word_count,word_locations_reordered
0,1,1,1,بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,1,1,,,38,بِسۡمِ | ٱللَّهِ | ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,4,[2]
1,2,1,2,ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ,1,1,,,37,ٱلۡحَمۡدُ | لِلَّهِ | رَبِّ | ٱلۡعَٰلَمِينَ,4,[2]
2,3,1,3,ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,0,0,,,23,ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,2,[]
3,4,1,4,مَٰلِكِ يَوۡمِ ٱلدِّينِ,0,0,,,23,مَٰلِكِ | يَوۡمِ | ٱلدِّينِ,3,[]
4,5,1,5,إِيَّاكَ نَعۡبُدُ وَإِيَّاكَ نَسۡتَعِينُ,0,0,,,40,إِيَّاكَ | نَعۡبُدُ | وَإِيَّاكَ | نَسۡتَعِينُ,4,[]
5,6,1,6,ٱهۡدِنَا ٱلصِّرَٰطَ ٱلۡمُسۡتَقِيمَ,0,0,,,34,ٱهۡدِنَا | ٱلصِّرَٰطَ | ٱلۡمُسۡتَقِيمَ,3,[]
6,7,1,7,صِرَٰطَ ٱلَّذِينَ أَنۡعَمۡتَ عَلَيۡهِمۡ غَيۡرِ...,0,0,,,90,صِرَٰطَ | ٱلَّذِينَ | أَنۡعَمۡتَ | عَلَيۡهِمۡ ...,9,[]
7,8,2,1,الٓمٓ,0,0,,,5,الٓمٓ,1,[]
8,9,2,2,ذَٰلِكَ ٱلۡكِتَٰبُ لَا رَيۡبَۛ فِيهِۛ هُدٗى لّ...,0,0,,,59,ذَٰلِكَ | ٱلۡكِتَٰبُ | لَا | رَيۡبَۛ | فِيهِۛ ...,7,[]
9,10,2,3,ٱلَّذِينَ يُؤۡمِنُونَ بِٱلۡغَيۡبِ وَيُقِيمُونَ...,0,0,,,92,ٱلَّذِينَ | يُؤۡمِنُونَ | بِٱلۡغَيۡبِ | وَيُقِ...,8,[]


## Step 6: Save the result to a new CSV file

In [9]:
# Write the result without the index column. The 'utf-8-sig' codec emits
# UTF-8 preceded by a byte-order mark (presumably so spreadsheet tools
# detect the encoding of the Arabic text — confirm this is intended).
output_file = '/kaggle/working/quran_ayahs_with_freq_comparison_3.csv'
final_df.to_csv(output_file, encoding='utf-8-sig', index=False)

print(f"✅ File saved successfully to: {output_file}")
print(f"\nTotal rows saved: {len(final_df)}")

✅ File saved successfully to: /kaggle/working/quran_ayahs_with_freq_comparison_3.csv

Total rows saved: 6236


## Summary Statistics

In [10]:
# Report row counts per source and a breakdown of the comparison labels
print("=" * 60)
print("SUMMARY STATISTICS")
print("=" * 60)
print(f"Total verses in Quran Ayahs file: {len(ayahs_df)}")
print(f"Total verses in Allah Reordered file: {len(reordered_df)}")
print(f"Total verses in merged file: {len(final_df)}")
print("\nFrequency comparison:")
# One count per label value produced by the comparison step
print(f"  - Matching frequencies: {final_df['frequency_comparison'].eq('').sum()}")
print(f"  - Mismatching frequencies: {final_df['frequency_comparison'].eq('error').sum()}")
print(f"  - No match found: {final_df['frequency_comparison'].eq('error - no match found').sum()}")
print("=" * 60)

SUMMARY STATISTICS
Total verses in Quran Ayahs file: 6236
Total verses in Allah Reordered file: 6236
Total verses in merged file: 6236

Frequency comparison:
  - Matching frequencies: 6229
  - Mismatching frequencies: 7
  - No match found: 0
