In [2]:
import json
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
def get_prompts_answers(json_data):
    """Collect every prompt/answer pair found in a sharings JSON payload.

    Walks ``json_data['Sources']``, then each source's ``'ChatgptSharing'``
    list, and reads ``'Prompt'`` and ``'Answer'`` from every entry under
    ``'Conversations'``. Sharing records that have no ``'Conversations'`` key
    are skipped.

    Args:
        json_data: Parsed JSON mapping with a ``'Sources'`` list.

    Returns:
        A ``(prompts, answers)`` tuple of two parallel lists, in document order.

    Raises:
        KeyError: If ``'Sources'``, ``'ChatgptSharing'``, ``'Prompt'`` or
            ``'Answer'`` is missing where it is read.
    """
    # Gather the pairs in a single pass so both lists stay aligned.
    pairs = [
        (turn['Prompt'], turn['Answer'])
        for source in json_data['Sources']
        for sharing in source['ChatgptSharing']
        if 'Conversations' in sharing
        for turn in sharing['Conversations']
    ]
    prompts = [prompt for prompt, _ in pairs]
    answers = [answer for _, answer in pairs]
    return prompts, answers

In [4]:
def write_to_csv(prompts, answers, csv_file):
    """Write parallel prompt/answer lists to a two-column UTF-8 CSV file.

    The file is overwritten. The first row is the header ``Prompt,Answer``;
    each following row holds one prompt and its answer. If the two lists
    differ in length, the extra items of the longer one are dropped (``zip``).

    Args:
        prompts: Iterable of prompt strings.
        answers: Iterable of answer strings, aligned with ``prompts``.
        csv_file: Path of the CSV file to create.
    """
    # newline='' lets the csv module control line endings itself. Without it,
    # Windows text mode expands the writer's '\r\n' to '\r\r\n', which shows
    # up as a blank row after every record when the file is read back.
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Prompt', 'Answer'])

        for p, a in zip(prompts, answers):
            # Round-trip through UTF-8 with errors ignored to drop characters
            # that cannot be encoded (e.g. lone surrogates), which would
            # otherwise make the write fail.
            p = p.encode('utf-8', 'ignore').decode('utf-8')
            a = a.encode('utf-8', 'ignore').decode('utf-8')
            writer.writerow([p, a])


In [5]:
# Usage: load the raw sharings JSON, extract the prompt/answer pairs, and
# persist them as a CSV for the rest of the notebook.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location so the notebook runs on other machines.
# The file is opened explicitly as UTF-8: the platform default encoding
# (e.g. cp1252 on Windows) can fail on, or silently mis-decode, non-ASCII text.
with open('C:/Users/bkeer/Downloads/discussion_sharings.json', encoding='utf-8') as f:
    data = json.load(f)

prompts, answers = get_prompts_answers(data)

csv_file = 'prompts_answers.csv'
write_to_csv(prompts, answers, csv_file)


In [6]:
# Read the CSV back as plain text.
# dtype=str together with keep_default_na=False keeps every cell a string:
# with the defaults, cells such as "NA", "null", "None" or an empty string are
# converted to float NaN, which TfidfVectorizer later rejects as an invalid
# document and which str() would turn into the literal text 'nan'.
df = pd.read_csv('prompts_answers.csv', dtype=str, keep_default_na=False)

In [9]:
import nltk
# Download the 'vader_lexicon' resource (network access; the log output below
# shows it being stored under the user's nltk_data directory).
# Presumably required by SentimentIntensityAnalyzer, so keep it before the
# constructor call.
nltk.download('vader_lexicon')
# Shared analyzer instance; later cells call sid.polarity_scores(...) on it.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bkeer\AppData\Roaming\nltk_data...


In [10]:
# Score each answer with the VADER analyzer and keep only its 'compound'
# value; str() guards against non-string cells.
df['Sentiment'] = df['Answer'].map(
    lambda answer_text: sid.polarity_scores(str(answer_text))['compound']
)

In [11]:
# Despite the column name, this is the character count of the answer text
# only (after str() conversion), not of the whole conversation.
df['ConversationLength'] = df['Answer'].map(
    lambda answer_text: len(str(answer_text))
)


In [12]:
# Binary label: 1 when the sentiment score is at least 0.1, otherwise 0.
df['SentimentLabel'] = (df['Sentiment'] >= 0.1).astype('int64')

In [13]:
# Derived feature: answer length weighted by its sentiment score.
df['UserExperience'] = df['ConversationLength'] * df['Sentiment']


In [15]:
import numpy as np

# The vocabulary and IDF weights are learned from the prompts only; the
# answers are then projected onto that same vocabulary.
tfidf = TfidfVectorizer()
prompts_tfidf = tfidf.fit_transform(df['Prompt'])
responses_tfidf = tfidf.transform(df['Answer'])

# Dense feature matrix: prompt TF-IDF | answer TF-IDF | three numeric columns.
features = np.hstack((
    prompts_tfidf.toarray(),
    responses_tfidf.toarray(),
    df[['Sentiment', 'ConversationLength', 'UserExperience']].to_numpy(),
))

In [16]:
# Single-column 2-D array (n_rows, 1) holding the sentiment score.
sentiment_features = df[['Sentiment']].to_numpy()


In [17]:
# Target vector for the sentiment baseline: the binary SentimentLabel column.
sentiment_labels = df['SentimentLabel'].to_numpy()

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
(
    X_sentiment_train,
    X_sentiment_test,
    y_sentiment_train,
    y_sentiment_test,
) = train_test_split(
    sentiment_features,
    sentiment_labels,
    random_state=42,
    test_size=0.2,
)

In [19]:
# Chance-level baseline: 'uniform' means random predictions.
# random_state is fixed so the random draws, and therefore the reported
# accuracy and classification report, are the same on every run.
sentiment_model = DummyClassifier(strategy='uniform', random_state=42)
sentiment_model.fit(X_sentiment_train, y_sentiment_train)

In [20]:
# Baseline predictions for the held-out rows.
sentiment_predictions = sentiment_model.predict(X=X_sentiment_test)

In [21]:
# Score the baseline on the held-out rows: overall accuracy plus the
# per-class text report.
sentiment_accuracy = accuracy_score(
    y_true=y_sentiment_test,
    y_pred=sentiment_predictions,
)
sentiment_report = classification_report(
    y_true=y_sentiment_test,
    y_pred=sentiment_predictions,
)

In [22]:

# Print the heading and the accuracy line, then the multi-line report.
for summary_line in (
    "Sentiment Analysis ",
    f"Accuracy: {sentiment_accuracy}",
):
    print(summary_line)
print("Classification Report:\n", sentiment_report)

Sentiment Analysis 
Accuracy: 0.47368421052631576
Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.50      0.29         8
           1       0.78      0.47      0.58        30

    accuracy                           0.47        38
   macro avg       0.49      0.48      0.43        38
weighted avg       0.66      0.47      0.52        38



In [23]:
# Build the inputs (features and labels) for the original classification task.
import numpy as np

# Targets: SentimentLabel is assumed to be the relevant column.
original_labels = df['SentimentLabel'].to_numpy()

# TF-IDF for prompts and responses. The vectorizer is fitted on the prompts
# only; responses are projected onto that same vocabulary.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so held-out rows contribute to the vocabulary — confirm this
# is intended.
tfidf = TfidfVectorizer()
prompts_tfidf = tfidf.fit_transform(df['Prompt'])
responses_tfidf = tfidf.transform(df['Answer'])
features = np.concatenate(
    [prompts_tfidf.toarray(), responses_tfidf.toarray()],
    axis=1,
)

# 80/20 train-test split for the original task, with a fixed seed.
(
    X_original_train,
    X_original_test,
    y_original_train,
    y_original_test,
) = train_test_split(
    features,
    original_labels,
    random_state=42,
    test_size=0.2,
)


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest for the original task; the seed is fixed so repeated runs
# build the same model.
original_model = RandomForestClassifier(random_state=42)
original_model.fit(X=X_original_train, y=y_original_train)

In [25]:
# Random-forest predictions for the held-out rows of the original task.
original_predictions = original_model.predict(X=X_original_test)