# Anfis 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import sys
sys.path.insert(0, 'code/')
from anfis import ANFIS, predict
import membershipfunction

In [2]:
def doAnfis(X_train, X_test, Y_train, Y_test, epochs=100, plot=True):
    """Train an ANFIS model on the six text features and report test accuracy.

    The model output is continuous, so predictions are rounded to the nearest
    integer and clipped to the label range seen in ``Y_train`` before being
    compared with ``Y_test``.

    Args:
        X_train: 2-D array of training features, one column per entry in the
            membership-function list below (six columns).
        X_test: 2-D array of test features with the same column layout.
        Y_train: 1-D array of integer-encoded training labels.
        Y_test: 1-D array of integer-encoded test labels.
        epochs: Number of training epochs passed to
            ``trainHybridJangOffLine``. Defaults to 100, the original
            hard-coded value.
        plot: When true (the default), call ``anf.plotErrors()`` and
            ``anf.plotResults()`` after evaluation.

    Returns:
        The trained ``ANFIS`` instance.

    Raises:
        ValueError: If ``X_train`` is not 2-D or its column count does not
            match the number of per-feature membership-function groups.
    """
    # Gaussian membership functions, one inner list per input feature.
    mf = [
        [
            ['gaussmf', {'mean': 0, 'sigma': 0.2}],  # No exclamations
            ['gaussmf', {'mean': 0.65, 'sigma': 0.6}],  # Some exclamations (e.g., mean and beyond)
        ],
        [
            ['gaussmf', {'mean': 0, 'sigma': 0.15}],  # No questions
            ['gaussmf', {'mean': 1, 'sigma': 0.5}],  # Some questions (mean or slightly beyond)
        ],
        [
            ['gaussmf', {'mean': 0.5, 'sigma': 0.15}],  # Low objectivity
            ['gaussmf', {'mean': 0.75, 'sigma': 0.1}],  # Medium objectivity
            ['gaussmf', {'mean': 0.9, 'sigma': 0.1}],  # High objectivity
        ],
        [
            ['gaussmf', {'mean': 0, 'sigma': 0.1}],  # No joy
            ['gaussmf', {'mean': 0.3, 'sigma': 0.2}],  # Some joy
        ],
        [
            ['gaussmf', {'mean': 0, 'sigma': 0.1}],  # Neutral/No negativity
            ['gaussmf', {'mean': 0.5, 'sigma': 0.2}],  # Some negativity
            ['gaussmf', {'mean': 1, 'sigma': 0.1}],  # Strong negativity
        ],
        [
            ['gaussmf', {'mean': 0, 'sigma': 0.1}],  # Neutral/No positivity
            ['gaussmf', {'mean': 0.5, 'sigma': 0.2}],  # Some positivity
            ['gaussmf', {'mean': 1, 'sigma': 0.1}],  # Strong positivity
        ],
    ]

    # Fail fast on a feature/MF mismatch: training is slow, so a shape error
    # should surface before the first epoch rather than somewhere inside it.
    if np.ndim(X_train) != 2 or np.shape(X_train)[1] != len(mf):
        raise ValueError(
            f"X_train must be 2-D with {len(mf)} feature columns, "
            f"got shape {np.shape(X_train)}"
        )

    # Initialize membership functions and the ANFIS model
    mfc = membershipfunction.MemFuncs(mf)
    anf = ANFIS(X_train, Y_train, mfc)

    # Train the ANFIS model
    anf.trainHybridJangOffLine(epochs=epochs)

    # Make predictions on the test set
    predictions = predict(anf, X_test)

    # The raw output is continuous; round to the nearest integer class code
    # and flatten so the shape matches Y_test.
    predictions_rounded = np.rint(predictions).astype(int).flatten()

    # A rounded value outside the known label range can never be correct, so
    # snap it to the nearest valid class code instead.
    predictions_rounded = np.clip(
        predictions_rounded, np.min(Y_train), np.max(Y_train)
    )

    # Accuracy calculation
    accuracy = accuracy_score(Y_test, predictions_rounded)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Optionally, plot errors and results
    if plot:
        anf.plotErrors()
        anf.plotResults()

    return anf

In [4]:
# Load the pre-computed feature table
processed_data = pd.read_csv('anfis_input.csv')
print("Data loaded")

# Input columns for the fuzzy system; the order here is the column order of X
to_keep = [
    'exclamation_score',
    'question_score',
    'obj_score',
    'joy_score',
    'vader_neg',
    'vader_pos',
]
fuzzy_data = processed_data[to_keep]

# Encode every distinct Emotion label as an integer code, numbered in the
# order the labels are returned by unique()
mapping_dict = dict(
    (label, code)
    for code, label in enumerate(processed_data['Emotion'].unique())
)
processed_data['Emotion_mapped'] = processed_data['Emotion'].map(mapping_dict)

# Feature matrix and encoded target vector
X = fuzzy_data.values
Y = processed_data['Emotion_mapped'].values

# Stratified 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

# Fit ANFIS on the training split and report accuracy on the test split
anfis_model = doAnfis(X_train, X_test, Y_train, Y_test)

Data loaded


Processing:   0%|          | 0/100 [00:00<?, ?it/s]

current error: 2708.996469066938


Processing:   1%|          | 1/100 [04:42<7:46:26, 282.69s/it]

current error: 2687.980977459974


Processing:   2%|▏         | 2/100 [09:29<7:45:27, 284.98s/it]

current error: 2693.9780846194003


Processing:   3%|▎         | 3/100 [14:11<7:39:01, 283.94s/it]

current error: 2701.4453814332446


Processing:   4%|▍         | 4/100 [18:55<7:33:52, 283.67s/it]

current error: 2711.192356920275


Processing:   5%|▌         | 5/100 [23:30<7:24:32, 280.77s/it]

current error: 2724.497382067581


Processing:   6%|▌         | 6/100 [28:10<7:19:09, 280.31s/it]

current error: 2743.6681241223523


Processing:   7%|▋         | 7/100 [32:47<7:13:08, 279.45s/it]

current error: 2745.2845809182318


Processing:   8%|▊         | 8/100 [37:24<7:07:14, 278.64s/it]

current error: 2743.2930746118354


Processing:   9%|▉         | 9/100 [42:02<7:02:06, 278.31s/it]

current error: 2741.3000813264125


Processing:  10%|█         | 10/100 [46:41<6:57:39, 278.44s/it]

current error: 2739.285930684785


Processing:  11%|█         | 11/100 [51:18<6:52:24, 278.03s/it]

current error: 2737.0066445799457


Processing:  12%|█▏        | 12/100 [55:51<6:45:49, 276.70s/it]

current error: 2734.349981416868


Processing:  13%|█▎        | 13/100 [1:00:37<6:45:17, 279.51s/it]

current error: 2731.129609147484


Processing:  14%|█▍        | 14/100 [1:05:32<6:47:05, 284.02s/it]

current error: 2727.050998951474


Processing:  15%|█▌        | 15/100 [1:10:45<6:54:55, 292.89s/it]

current error: 2721.686679288786


Processing:  16%|█▌        | 16/100 [1:16:22<7:08:26, 306.03s/it]

current error: 2714.497985444448


Processing:  17%|█▋        | 17/100 [1:21:17<6:58:43, 302.70s/it]

current error: 2704.9381859241294


Processing:  18%|█▊        | 18/100 [1:26:08<6:49:09, 299.38s/it]

current error: 2692.7019347730197


Processing:  19%|█▉        | 19/100 [1:31:08<6:44:24, 299.56s/it]

current error: 2678.5927751649087


Processing:  20%|██        | 20/100 [1:37:40<7:16:19, 327.24s/it]

current error: 2664.3395413785943


Processing:  21%|██        | 21/100 [1:43:54<7:29:04, 341.08s/it]

current error: 2655.3421885596354


Processing:  22%|██▏       | 22/100 [1:49:58<7:32:35, 348.14s/it]

current error: 2684.178360348643


Based on the summary statistics we created for each of the six features, we can tailor the membership functions (MFs) for the fuzzy system. The goal is to capture the distribution of each feature effectively by setting appropriate mean and sigma values for Gaussian membership functions.


	    exclamation_score	question_score	obj_score	joy_score	vader_neg	vader_pos
count	1110.000000	        1110.000000	    1110.000000	1110.000000	1110.000000	1110.000000
mean	0.065042	        0.042836	    0.764637	0.044619	0.086072	0.123200
std	    0.231353	        0.184856	    0.146033	0.085424	0.119278	0.137306
min	    0.000000	        0.000000	    0.000000	0.000000	0.000000	0.000000
25%	    0.000000	        0.000000	    0.680783	0.000000	0.000000	0.000000
50%	    0.000000	        0.000000	    0.775212	0.000000	0.046000	0.099000
75%	    0.000000	        0.013289	    0.861111	0.064516	0.131000	0.177750
max	    3.000000	        4.000000	    1.000000	1.000000	1.000000	1.000000


General Approach:
Mean (mean): Set based on typical values such as quartiles or based on the distribution's central tendency (e.g., mean or median).
Sigma (sigma): Set based on the spread of the data, using standard deviation or a fraction of the feature range.


1. exclamation_score
Range: [0, 3], with most data at 0 (the 75th percentile is 0, so at least 75% of the data has a value of 0).
Mean: You might consider separating between "no exclamations" and "has exclamations". Set one MF around 0 for the bulk of the data and another around the mean or a higher value for rare instances of multiple exclamations.
Sigma: Based on the standard deviation (0.23), choose a moderate width for the Gaussians.

2. question_score
Range: [0, 4], with most data around 0 (the median is 0 and the 75th percentile is only about 0.013).
Mean: Similar to exclamation_score, most data is 0. So, you can have one MF around 0 and another around the mean or a larger value.
Sigma: The standard deviation is 0.18, so the spread should reflect this.

3. obj_score
Range: [0, 1], with mean 0.76, most data between 0.68 and 0.86.
Mean: Since this feature seems to be concentrated in the higher range (close to 1), you could have MFs representing low, medium, and high scores.
Sigma: With a standard deviation of 0.15, MFs should have moderate overlap.

4. joy_score
Range: [0, 1], with most data clustered around 0 (50% of the data is 0, mean is 0.044).
Mean: You can have one MF for "no joy" around 0 and another for "presence of joy" closer to 1.
Sigma: Standard deviation is 0.085, so use small spread.

5. vader_neg (Negative sentiment)
Range: [0, 1], with most data concentrated near 0 (median 0.046, 75th percentile 0.131), but there's a small tail up to 1.
Mean: Have one MF around 0 for neutral sentiment, and another for stronger negative sentiment around 1.
Sigma: Standard deviation is 0.12, so a relatively narrow spread can be used.

6. vader_pos (Positive sentiment)
Range: [0, 1], mean is 0.12, with most data below 0.18.
Mean: Have one MF for neutral sentiment around 0, and another for positive sentiment closer to 1.
Sigma: Similar to vader_neg, standard deviation is 0.14, so a narrow spread can be used.

Explanation:
exclamation_score and question_score: Since most values are 0, we use two MFs to separate "no exclamation/question" from "some exclamations/questions".
obj_score: Values sit in a higher range and cluster around 0.76, so we use three MFs representing low, medium, and high objectivity.
joy_score, vader_neg, and vader_pos: joy_score uses two MFs ("none" and "some"), while vader_neg and vader_pos each use three MFs ("none", "some", and "strong").
This setup gives a good balance between capturing the typical range and outliers in your dataset.