In [None]:
# load.py
import sys
import pandas as pd
import subprocess

def load_data(file_path, output_path='/home/doc-bd-a1/twitterdata.csv'):
    """Read a CSV dataset and re-save it where the rest of the pipeline expects it.

    Args:
        file_path: Path of the source CSV to read.
        output_path: Destination CSV. Defaults to the location the script has
            always written to, so existing callers are unaffected.

    Returns:
        The path the data was written to.
    """
    df = pd.read_csv(file_path)
    # index=False keeps the row index out of the file so it does not come back
    # as an extra column when the file is read again.
    df.to_csv(output_path, index=False)
    print(f"Data loaded and saved to {output_path}")
    return output_path

if __name__ == "__main__":
    # Script entry point: copy the dataset named on the command line into the
    # working directory, then hand the saved copy to the preprocessing stage.
    if len(sys.argv) < 2:
        print("Usage: python3 load.py <dataset-path>")
        sys.exit(1)

    dataset_path = sys.argv[1]
    processed_path = load_data(dataset_path)

    # Propagate the child's exit status so a failure in a later stage is not
    # reported as success by this one.
    result = subprocess.run(['python3', '/home/doc-bd-a1/dpre.py', processed_path])
    sys.exit(result.returncode)


In [None]:
# dpre.py
import sys
import pandas as pd

def preprocess_data(file_path, output_path='/home/doc-bd-a1/res_dpre.csv'):
    """Clean the dataset and derive emotion and text-length features.

    Rows containing any missing value and exact duplicate rows are removed.
    The ``label`` column is replaced by an ``emotion`` name, and two length
    features are added: ``txt_len`` (character count of ``text``) and
    ``lenCat.`` (``short`` for 1-50 characters, ``medium`` for 51-100,
    ``long`` above 100).

    Args:
        file_path: CSV containing at least ``text`` and ``label`` columns.
        output_path: Destination CSV. Defaults to the location the script has
            always written to, so existing callers are unaffected.

    Returns:
        The path the preprocessed data was written to.
    """
    df = pd.read_csv(file_path).dropna().drop_duplicates()

    e_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    # Labels that are not keys of e_map come out as NaN in 'emotion'.
    df['emotion'] = df['label'].map(e_map)
    # Vectorised string length instead of a Python-level apply(len).
    df['txt_len'] = df['text'].str.len()

    df = df.drop(columns=['label'])

    # pd.cut intervals are right-closed: (0, 50], (50, 100], (100, inf].
    bins = [0, 50, 100, float('inf')]
    labels = ['short', 'medium', 'long']
    df['lenCat.'] = pd.cut(df['txt_len'], bins=bins, labels=labels)

    df.to_csv(output_path, index=False)
    print(f"Data preprocessed and saved to {output_path}")
    return output_path

if __name__ == "__main__":
    # Script entry point: preprocess the CSV named on the command line, then
    # hand the result to the EDA stage.
    # This script's header imports only sys and pandas, so subprocess is
    # imported here; without it the subprocess.run call raises NameError.
    import subprocess

    if len(sys.argv) < 2:
        print("Usage: python3 dpre.py <file-path>")
        sys.exit(1)

    input_path = sys.argv[1]
    processed_path = preprocess_data(input_path)

    # Propagate the child's exit status so a failure in a later stage is not
    # reported as success by this one.
    result = subprocess.run(['python3', '/home/doc-bd-a1/eda.py', processed_path])
    sys.exit(result.returncode)


In [None]:
# eda.py
import sys
import pandas as pd

def eda_analysis(file_path, output_dir='/home/doc-bd-a1'):
    """Compute summary insights for the dataset and write each to its own file.

    Seven one-line insights are produced, in this order: total records,
    average text length, most common emotion, unique text count, and the
    minimum, maximum and median text length. Insight ``k`` is written to
    ``<output_dir>/eda-in-<k>.txt``.

    Args:
        file_path: CSV containing ``text`` and ``emotion`` columns.
        output_dir: Directory that receives the insight files. Defaults to the
            location the script has always written to.

    Returns:
        The path of the last insight file written.
    """
    import os  # local import: this script's header does not import os

    df = pd.read_csv(file_path)

    # Measure the text lengths once and reuse them for every length statistic.
    text_lengths = df['text'].str.len()

    # mode() is empty when there are no rows; fall back to a placeholder
    # instead of failing on the positional lookup.
    emotion_modes = df['emotion'].mode()
    most_common = emotion_modes.iloc[0] if not emotion_modes.empty else 'N/A'

    insights = [
        f"Total records: {len(df)}",
        f"Average length of tweets: {text_lengths.mean()}",
        f"Most common emotion: {most_common}",
        f"Unique tweets: {df['text'].nunique()}",
        f"Min length of tweet: {text_lengths.min()}",
        f"Max length of tweet: {text_lengths.max()}",
        f"Median length of tweet: {text_lengths.median()}",
    ]

    # A separate name keeps the file_path argument intact while looping.
    insight_path = None
    for number, insight in enumerate(insights, start=1):
        insight_path = os.path.join(output_dir, f'eda-in-{number}.txt')
        with open(insight_path, 'w') as f:
            f.write(insight)
        print(f"Saved insight {number} to {insight_path}")

    print("EDA analysis completed.")
    return insight_path

if __name__ == "__main__":
    # Script entry point: run the EDA on the CSV named on the command line,
    # then pass that same CSV on to the visualisation stage.
    # This script's header imports only sys and pandas, so subprocess is
    # imported here; without it the subprocess.run call raises NameError.
    import subprocess

    if len(sys.argv) < 2:
        print("Usage: python3 eda.py <file-path>")
        sys.exit(1)

    input_path = sys.argv[1]
    eda_analysis(input_path)

    # Propagate the child's exit status so a failure in a later stage is not
    # reported as success by this one.
    result = subprocess.run(['python3', '/home/doc-bd-a1/vis.py', input_path])
    sys.exit(result.returncode)


In [None]:
# vis.py
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess

def visualize(file_path, output_path='/home/doc-bd-a1/vis.png'):
    """Save a count plot of the ``emotion`` column to an image file.

    Args:
        file_path: CSV containing an ``emotion`` column.
        output_path: Destination image. Defaults to the location the script
            has always written to.

    Returns:
        The path the figure was written to.
    """
    df = pd.read_csv(file_path)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # NOTE(review): newer seaborn releases reportedly warn when `palette`
        # is passed without `hue` - confirm against the installed version
        # before changing the call.
        sns.countplot(data=df, x='emotion', palette='Set2', ax=ax)
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving fails; pyplot keeps
        # every figure alive until it is closed explicitly.
        plt.close(fig)

    print(f"Visualization saved to {output_path}")
    return output_path

if __name__ == "__main__":
    # Script entry point: plot the CSV named on the command line, then pass
    # that same CSV on to the clustering stage.
    if len(sys.argv) < 2:
        print("Usage: python3 vis.py <file-path>")
        sys.exit(1)

    input_path = sys.argv[1]
    visualize(input_path)

    # Propagate the child's exit status so a failure in the model stage is not
    # reported as success by this one.
    result = subprocess.run(['python3', '/home/doc-bd-a1/model.py', input_path])
    sys.exit(result.returncode)


In [None]:
# model.py
import sys
import pandas as pd
from sklearn.cluster import KMeans

def run_kmeans(file_path, output_path='/home/doc-bd-a1/k.txt'):
    """Cluster the numeric columns with k-means and save the cluster sizes.

    The float and int columns of the CSV are clustered into six groups with a
    fixed random seed, and the number of rows per cluster is written as text.

    Args:
        file_path: CSV to cluster.
        output_path: Destination text file. Defaults to the location the
            script has always written to.

    Returns:
        The path the cluster counts were written to.

    Raises:
        ValueError: If the CSV has no float or int columns to cluster on.
    """
    df = pd.read_csv(file_path)

    features = df.select_dtypes(include=[float, int])
    # Fail early with a message that names the file rather than letting the
    # estimator reject an empty feature matrix.
    if features.shape[1] == 0:
        raise ValueError(f"No numeric columns to cluster in {file_path}")

    kmeans = KMeans(n_clusters=6, random_state=0)
    df['cluster'] = kmeans.fit_predict(features)

    cluster_counts = df['cluster'].value_counts()
    with open(output_path, 'w') as f:
        f.write(cluster_counts.to_string())
    print(f"K-means results saved to {output_path}")
    return output_path

if __name__ == "__main__":
    # Script entry point for the clustering stage: expects the CSV path as the
    # first command-line argument and exits with status 1 when it is missing.
    if len(sys.argv) >= 2:
        input_path = sys.argv[1]
        run_kmeans(input_path)
    else:
        print("Usage: python3 model.py <file-path>")
        sys.exit(1)
