# Zip File Extration

In [None]:
import tarfile
import os

tar_files_dir = 'folder-1'

parent_extracted_dir = 'folder-1'

if not os.path.exists(parent_extracted_dir):
    os.makedirs(parent_extracted_dir)

for filename in os.listdir(tar_files_dir):
    file_path = os.path.join(tar_files_dir, filename)

    if os.path.isfile(file_path) and tarfile.is_tarfile(file_path):
        tar_filename = os.path.splitext(filename)[0] 
        extracted_dir = os.path.join(parent_extracted_dir, tar_filename)

        os.makedirs(extracted_dir, exist_ok=True)

        try:
            with tarfile.open(file_path, 'r:gz') as tar: 
                tar.extractall(path=extracted_dir)

            print(f'Tar file "{file_path}" has been extracted to "{extracted_dir}"')

            os.remove(file_path)

        except tarfile.ReadError as e:
            print(f'Error extracting "{file_path}": {e}. Skipping...')
        except PermissionError as e:
            print(f'Permission error for "{file_path}": {e}. Skipping...')
        except EOFError as e:
            print(f'EOFError for "{file_path}": {e}. The file might be corrupted or incomplete. Skipping...')
    else:
        print(f'Skipping non-tar file or directory: "{file_path}"')


In [None]:
import tarfile
import os

tar_files_dir = 'folder-2'

parent_extracted_dir = 'folder-2'

if not os.path.exists(parent_extracted_dir):
    os.makedirs(parent_extracted_dir)

for filename in os.listdir(tar_files_dir):
    file_path = os.path.join(tar_files_dir, filename)

    if os.path.isfile(file_path) and tarfile.is_tarfile(file_path):
        tar_filename = os.path.splitext(filename)[0] 
        extracted_dir = os.path.join(parent_extracted_dir, tar_filename)

        os.makedirs(extracted_dir, exist_ok=True)

        try:
            with tarfile.open(file_path, 'r:gz') as tar: 
                tar.extractall(path=extracted_dir)

            print(f'Tar file "{file_path}" has been extracted to "{extracted_dir}"')

            os.remove(file_path)

        except tarfile.ReadError as e:
            print(f'Error extracting "{file_path}": {e}. Skipping...')
        except PermissionError as e:
            print(f'Permission error for "{file_path}": {e}. Skipping...')
        except EOFError as e:
            print(f'EOFError for "{file_path}": {e}. The file might be corrupted or incomplete. Skipping...')
    else:
        print(f'Skipping non-tar file or directory: "{file_path}"')


# Classification

In [None]:
import os
txt_file1 = []
for root, dirs, files in os.walk("folder-1"):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            txt_file1.append(file_path)
print(txt_file1)

In [None]:
import os
txt_file2 = []
for root, dirs, files in os.walk("folder-2"):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            txt_file2.append(file_path)
print(txt_file2)

# Data Cleaning

In [None]:
import pandas as pd

total_null_count = 0

for txt in txt_file1:
    try:
        df = pd.read_fwf(txt)
        df = df.dropna()
        total_null_count += df.isnull().sum().sum()
    except (pd.errors.EmptyDataError, FileNotFoundError) as e:
        print(f"Skipping file {txt} due to error: {e}")

print(total_null_count)

In [None]:
import pandas as pd

total_null_count = 0

for txt in txt_file2:
    try:
        df = pd.read_fwf(txt)
        df = df.dropna()
        total_null_count += df.isnull().sum().sum()
    except (pd.errors.EmptyDataError, FileNotFoundError) as e:
        print(f"Skipping file {txt} due to error: {e}")

print(total_null_count)

# TF-IDF and Cosine Similarity

In [None]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Function to tokenize text and perform lemmatization
def tokenize_text_lemmatization(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]

    # Check if all tokens are stop words
    if not tokens:
        # If all tokens are stop words, add a placeholder token to avoid empty vocabulary
        tokens = ['placeholder']

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

# Function to calculate TF-IDF for a list of file paths
def calculate_tfidf_for_file_list(file_paths):
    tfidf_matrices = []

    for file_path in file_paths:
        # Tokenize each file
        tokenized_texts_lemmatization = tokenize_text_lemmatization(file_path)

        # Convert tokenized texts to string for TfidfVectorizer
        tokenized_texts_str = ' '.join(tokenized_texts_lemmatization)

        # Create TfidfVectorizer
        vectorizer = TfidfVectorizer()

        # Fit and transform the tokenized texts
        tfidf_matrix = vectorizer.fit_transform([tokenized_texts_str])

        # Get feature names (terms)
        feature_names = vectorizer.get_feature_names_out()

        # Convert the TF-IDF matrix to a dense array for easier manipulation
        dense_tfidf_matrix = tfidf_matrix.todense()

        # Create a DataFrame for better visualization (optional)
        import pandas as pd
        df_tfidf = pd.DataFrame(dense_tfidf_matrix, columns=feature_names)

        tfidf_matrices.append(df_tfidf)

    return tfidf_matrices

# Calculate TF-IDF for each list of file paths
tfidf_folder1 = calculate_tfidf_for_file_list(txt_file1)
tfidf_folder2 = calculate_tfidf_for_file_list(txt_file2)


# calculate the cosine similarity between the matrices
for tfidf_matrix1 in tfidf_folder1:
    for tfidf_matrix2 in tfidf_folder2:
        similarity_matrix = cosine_similarity(tfidf_matrix1, tfidf_matrix2)
        print(f"Cosine Similarity Matrix between files in folder1 and folder2:")
        print(similarity_matrix)


In [None]:
# from pyspark.sql import SparkSession
# from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
# from pyspark.ml.linalg import DenseVector
# from pyspark.ml import Pipeline
# from pyspark.sql.functions import udf
# from pyspark.sql.types import DoubleType
# from pyspark import SparkContext

# # Initialize Spark session
# spark = SparkSession.builder.appName("TFIDFComparison").getOrCreate()
# sc = SparkContext.getOrCreate()

# # Function to read text file
# def read_text_file(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         text = file.read()
#     return text

# # UDF to perform lemmatization (you can replace it with your own lemmatization logic)
# @udf
# def lemmatize_udf(text):
#     # Your lemmatization logic here
#     return text

# # UDF to convert sparse vector to dense vector
# @udf(DenseVector)
# def sparse_to_dense(vector):
#     return DenseVector(vector.toArray())

# # Function to calculate cosine similarity
# def calculate_cosine_similarity(v1, v2):
#     dot_product = v1.dot(v2)
#     magnitude_v1 = v1.norm(2)
#     magnitude_v2 = v2.norm(2)
#     similarity = dot_product / (magnitude_v1 * magnitude_v2)
#     return similarity

# # Create DataFrame from text files
# df_folder1 = spark.createDataFrame([(file_path, read_text_file(file_path)) for file_path in txt_file1], ["path", "text"])
# df_folder2 = spark.createDataFrame([(file_path, read_text_file(file_path)) for file_path in txt_file2], ["path", "text"])

# # Tokenize, remove stopwords, and calculate TF
# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# remover = StopWordsRemover(inputCol="words", outputCol="filtered")
# vectorizer = CountVectorizer(inputCol="filtered", outputCol="tf_features")

# # Create a Pipeline
# pipeline = Pipeline(stages=[tokenizer, remover, vectorizer])
# model = pipeline.fit(df_folder1.union(df_folder2))

# # Transform the data
# df_folder1 = model.transform(df_folder1)
# df_folder2 = model.transform(df_folder2)

# # Calculate IDF
# idf = IDF(inputCol="tf_features", outputCol="idf_features")
# idf_model = idf.fit(df_folder1.union(df_folder2))
# df_folder1 = idf_model.transform(df_folder1)
# df_folder2 = idf_model.transform(df_folder2)

# # Convert sparse vectors to dense vectors
# df_folder1 = df_folder1.withColumn("dense_features", sparse_to_dense("idf_features"))
# df_folder2 = df_folder2.withColumn("dense_features", sparse_to_dense("idf_features"))

# # Calculate cosine similarity
# result = df_folder1.crossJoin(df_folder2)\
#     .withColumn("cosine_similarity", calculate_cosine_similarity("dense_features", "dense_features"))\
#     .select("path", "path", "cosine_similarity")

# # Show the result
# result.show(truncate=False)
# # 

In [1]:
import os
import hashlib

def calculate_sha256(file_path, block_size=4096):
    sha256_hash = hashlib.sha256()
    
    with open(file_path, "rb") as file:
        for byte_block in iter(lambda: file.read(block_size), b""):
            sha256_hash.update(byte_block)

    return sha256_hash.hexdigest()

def generate_file_hashes(folder_path):
    file_hashes = {}

    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            # Calculate SHA-256 hash only if file size and modification time match
            file_stats = os.stat(file_path)
            key = (file_stats.st_size, file_stats.st_mtime)

            if key not in file_hashes:
                file_hashes[key] = calculate_sha256(file_path)

    return file_hashes

# Example usage:
source_folder = "folder-1"
destination_folder = "folder-2"

# Generate hash values for files in each folder
source_hashes = generate_file_hashes(source_folder)
destination_hashes = generate_file_hashes(destination_folder)

# Identify duplicate files
duplicate_files = {key for key in source_hashes if key in destination_hashes}

# Display duplicate files
print("Duplicate Files:")
for key in duplicate_files:
    print(f"- {key}")


Duplicate Files:
- (49270, 1613982378.0)
- (93856, 1613982348.0)
- (262, 1613982299.0)
- (1182, 1613982305.0)
- (0, 1613982299.0)
- (60, 1613982313.0)
- (923, 1613982310.0)
- (500, 1613982356.0)
- (66854, 1613982389.0)
- (123, 1613982369.0)
- (438, 1613982381.0)
- (213, 1613982299.0)
- (276150, 1613982356.0)
- (100337, 1613982336.0)
- (121, 1613982301.0)
- (1892, 1613982293.0)
- (5971, 1613982365.0)
- (93330, 1613982359.0)
- (2715, 1613982366.0)
- (106526, 1613982360.0)
- (593, 1613982296.0)
- (98452, 1613982358.0)
- (6422, 1613982386.0)
- (106222, 1613982363.0)
- (5367, 1613982335.0)
- (3716, 1613982296.0)
- (18895, 1613982321.0)
- (2855, 1613982345.0)
- (721, 1613982301.0)
- (232000, 1613982357.0)
- (122, 1613982386.0)
- (14, 1613982295.0)
- (0, 1613982397.0)
- (95933, 1613982364.0)
- (246, 1613982309.0)
- (170332, 1613982372.0)
- (15, 1613982296.0)
- (4025, 1613982354.0)
- (13847, 1613982364.0)
- (155, 1613982312.0)
- (975, 1613982310.0)
- (721, 1613982314.0)
- (98467, 1613982356.0)

In [None]:
import os
import xxhash

def calculate_hash(file_path, chunk_size=4096):
    hasher = xxhash.xxh64()
    with open(file_path, 'rb') as file:
        while True:
            data = file.read(chunk_size)
            if not data:
                break
            hasher.update(data)
    return hasher.hexdigest()

def find_duplicates(folder1, folder2):
    hash_table = {}

    # Process files in the first folder
    for root, dirs, files in os.walk(folder1):
        for file in files:
            file_path = os.path.join(root, file)
            file_hash = calculate_hash(file_path)
            hash_table[file_hash] = file_path

    # Check files in the second folder for duplicates
    duplicate_files = []
    for root, dirs, files in os.walk(folder2):
        for file in files:
            file_path = os.path.join(root, file)
            file_hash = calculate_hash(file_path)
            if file_hash in hash_table:
                duplicate_files.append((file_path, hash_table[file_hash]))

    return duplicate_files

if __name__ == "__main__":
    folder1 = "/path/to/first/folder"
    folder2 = "/path/to/second/folder"

    duplicates = find_duplicates(folder1, folder2)

    if duplicates:
        print("Duplicate files found:")
        for file1, file2 in duplicates:
            print(f"{file1} and {file2}")
    else:
        print("No duplicate files found.")
