In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from lightgbm import early_stopping,log_evaluation,LGBMClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from textblob import TextBlob #for sentiment analysis

In [22]:

# Read the train and test splits from the parquet files in the working directory.
train, test = map(pd.read_parquet, ("train.parquet", "test.parquet"))

In [23]:
def sentiment_analysis(text):
    """Return the TextBlob polarity score of ``text``.

    Parameters
    ----------
    text : str or missing value
        Text to score. Anything that is not a ``str`` (for example ``None``
        or ``NaN`` coming from a missing cell) is treated as having no
        sentiment.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity`` for string input, ``0.0``
        (neutral) for non-string input.
    """
    # TextBlob only accepts strings; without this guard a single missing
    # cell would abort the whole ``Series.apply`` call with a TypeError.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.polarity

# Score the polarity of every text field in both frames.
# `train` is processed before `test`, and within each frame the columns are
# created in the order prompt, response_a, response_b, so the resulting column
# layout is the same as writing the six assignments out by hand.
for frame in (train, test):
    for text_col in ("prompt", "response_a", "response_b"):
        frame[f"{text_col}_sentiment"] = frame[text_col].apply(sentiment_analysis)

In [24]:
# Here I compute some features
def compute_feats(df):
    """Add hand-crafted character-level features for each text column.

    For each of ``response_a``, ``response_b`` and ``prompt`` the function
    adds columns named ``<col>_<feature>`` holding lengths, regex match
    counts, character-class ratios, bracket balances and a "json" mention
    count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing string columns ``response_a``, ``response_b`` and
        ``prompt``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the new columns are added in place.

    Notes
    -----
    All regex patterns are raw strings so that backslash sequences such as
    ``\\s`` or ``\\?`` reach the regex engine without triggering Python's
    invalid-escape-sequence warning.
    """
    for col in ["response_a","response_b","prompt"]:
        text = df[col]

        # Response length is a key factor when choosing between two responses
        df[f"{col}_len"]=text.str.len()

        # Character-counting features (number of regex matches per row)
        df[f"{col}_spaces"]=text.str.count(r"\s")
        df[f"{col}_punct"]=text.str.count(r",|\.|!")
        df[f"{col}_question_mark"]=text.str.count(r"\?")
        df[f"{col}_quot"]=text.str.count("'|\"")
        df[f"{col}_formatting_chars"]=text.str.count(r"\*|\_")
        df[f"{col}_math_chars"]=text.str.count(r"\-|\+|\=")
        df[f"{col}_curly_open"]=text.str.count(r"\{")
        df[f"{col}_curly_close"]=text.str.count("}")
        df[f"{col}_round_open"]=text.str.count(r"\(")
        df[f"{col}_round_close"]=text.str.count(r"\)")
        df[f"{col}_special_chars"]=text.str.count(r"\W")
        # Boolean flag: does the text contain at least one digit?
        df[f"{col}_digits"]=text.str.count(r"\d")>0

        # Ratios relative to the text length.
        # NOTE: a zero-length text divides by zero here.
        df[f"{col}_lower"]=text.str.count("[a-z]").astype("float32")/df[f"{col}_len"]
        df[f"{col}_upper"]=text.str.count("[A-Z]").astype("float32")/df[f"{col}_len"]
        # NOTE(review): the trailing "+" makes this count contiguous runs of
        # CJK characters rather than individual characters, unlike the two
        # ratios above - confirm this is intended.
        df[f"{col}_chinese"]=text.str.count(r'[\u4e00-\u9fff]+').astype("float32")/df[f"{col}_len"]

        # Features that show how balanced round and curly brackets are
        df[f"{col}_round_balance"]=df[f"{col}_round_open"]-df[f"{col}_round_close"]
        df[f"{col}_curly_balance"]=df[f"{col}_curly_open"]-df[f"{col}_curly_close"]

        # Number of case-insensitive occurrences of "json" (e.g. a request for a JSON response).
        # This could be extended to other formats such as yaml, but the train set
        # should be analysed first to check that enough data is present for it to be useful.
        df[f"{col}_json"]=text.str.lower().str.count("json")
    return df
    
# Add the hand-crafted features to both splits (train first, then test).
train, test = map(compute_feats, (train, test))

In [25]:
# Two TF-IDF views of the text: character 1-2-grams (vocabulary capped at
# 50000 entries) and word tokens that occur in at least 3 documents.
vectorizer_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1,2),
    max_features=50000,
    sublinear_tf=True,
)
vectorizer_word = TfidfVectorizer(analyzer='word', min_df=3, sublinear_tf=True)

# One (char + word) FeatureUnion per text column, each routed to its own column.
preprocessor = ColumnTransformer(
    transformers=[
        (
            f"{col}_feats",
            FeatureUnion([
                (f"{col}_char", vectorizer_char),
                (f"{col}_word", vectorizer_word),
            ]),
            col,
        )
        for col in ("prompt", "response_a", "response_b")
    ]
)

# Fit on the training text only, then apply the fitted transformer to the test text.
# NOTE(review): train_feats / test_feats are not consumed by the model cells
# visible below - confirm they are used later in the notebook.
train_feats = preprocessor.fit_transform(train[["response_a","response_b","prompt"]])
test_feats = preprocessor.transform(test[["response_a","response_b","prompt"]])

In [26]:
# Everything after the first 8 columns of `train` is used as a model feature.
# NOTE(review): this assumes train.parquet has exactly 8 raw columns and that
# all engineered columns were appended after them - confirm against the data.
feats=list(train.columns)[8:]
# Safety net for the positional slice above: the label must never be a feature.
assert "winner" not in feats, "label column 'winner' leaked into the feature list"

# Binary target: 1 when model_a won, 0 otherwise.
# The dtype guard makes this cell safe to re-run: once `winner` has been
# encoded as integers, comparing it to "model_a" again would silently turn
# every label into 0.
if not pd.api.types.is_integer_dtype(train["winner"]):
    train["winner"]=(train["winner"]=="model_a").astype("int")
X_train=train[feats]
y_train=train["winner"]

In [27]:
from sklearn.metrics import accuracy_score
# Split the training data into training and validation sets (15% held out).
# The split is taken from the full `train` frame rather than from
# `X_train`/`y_train`, which this statement overwrites: that way re-running
# the cell reproduces the same split instead of splitting an already-reduced
# training set again.
X_train, X_val, y_train, y_val = train_test_split(
    train[feats], train["winner"], test_size=0.15, random_state=42
)

# Create the model with early stopping
model = LGBMClassifier(n_estimators=1000,  # Set a large number for early stopping
                        learning_rate=0.1,
                        early_stopping_rounds=10,)  # Stop if no improvement in 10 rounds

# Train the model, monitoring log-loss on the validation split
model.fit(X_train, y_train,eval_set=[(X_val, y_val)], eval_metric='binary_logloss')

[LightGBM] [Info] Number of positive: 20420, number of negative: 20753
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10061
[LightGBM] [Info] Number of data points in the train set: 41173, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495956 -> initscore=-0.016176
[LightGBM] [Info] Start training from score -0.016176
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[61]	valid_0's binary_logloss: 0.656859


In [28]:
from sklearn.metrics import precision_score

# Hard class predictions for the held-out validation rows
y_val_pred = model.predict(X_val)

# Compute accuracy and precision against the validation labels
accuracy, precision = (
    metric(y_val, y_val_pred) for metric in (accuracy_score, precision_score)
)

# Print both metrics with four decimals
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation precision: {precision:.4f}")

Validation Accuracy: 0.5963
Validation precision: 0.5852


In [29]:
# Display the training split of the feature matrix (the rendered output elides rows and columns with "...").
X_train

Unnamed: 0,prompt_sentiment,response_a_sentiment,response_b_sentiment,response_a_len,response_a_spaces,response_a_punct,response_a_question_mark,response_a_quot,response_a_formatting_chars,response_a_math_chars,...,prompt_round_open,prompt_round_close,prompt_special_chars,prompt_digits,prompt_lower,prompt_upper,prompt_chinese,prompt_round_balance,prompt_curly_balance,prompt_json
29897,0.000000,0.000000,0.000000,651,88,42,0,0,32,0,...,0,0,25,True,0.000000,0.000000,0.102041,0,0,0
32393,0.000000,0.150000,0.000000,1716,300,28,0,0,20,0,...,0,0,8,False,0.735294,0.000000,0.000000,0,0,0
37086,0.000000,-0.244444,0.000000,836,131,14,0,0,0,0,...,0,0,4,False,0.578947,0.157895,0.000000,0,0,0
44057,0.000000,0.000000,0.000000,503,86,18,3,0,20,2,...,0,0,149,True,0.009021,0.001289,0.000000,0,0,0
43482,0.000000,0.390000,0.298052,199,35,2,1,2,0,0,...,0,0,7,False,0.837209,0.000000,0.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0.000000,0.000000,0.000000,1087,196,36,0,0,0,1,...,0,0,5,True,0.764706,0.000000,0.000000,0,0,0
44732,0.000000,-0.001786,0.070181,1372,245,28,0,2,40,17,...,0,0,6,False,0.837838,0.000000,0.000000,0,0,0
38158,0.059951,0.060218,-0.016673,1471,205,19,0,1,0,0,...,1,1,709,False,0.812605,0.017493,0.000000,0,0,0
860,0.333333,0.054167,0.040899,1403,222,23,0,3,0,4,...,0,0,24,False,0.775000,0.025000,0.000000,0,0,0
